# -*- coding: utf-8 -*-
from copy import deepcopy

import scrapy
class SuSpider(scrapy.Spider):
    """Crawl suning.com: category tree -> tag listing pages -> product detail pages.

    Flow:
        parse        -- walk big categories / sub-categories / tags on the start page
                        and request each tag's listing page.
        good_list    -- extract per-product fields from a listing page, follow each
                        product's detail link, and paginate via the "next page" link.
        good_detail  -- read the spec/attribute table and yield the finished item.
    """

    name = 'su'
    allowed_domains = ['suning.com']
    start_urls = ['http://list.suning.com/?safp=d488778a.error1.0.4786e76351']

    def parse(self, response):
        # Big categories in the left-hand "all sorts" sidebar.
        bcate_list = response.xpath("//div[@class='allsortLeft']/ul/li")
        for bcate in bcate_list:
            item = {}
            # The big category's class attribute links it to its sub-category panel.
            class_name = bcate.xpath("./@class").extract_first()
            item["BCate"] = bcate.xpath("./a/span/text()").extract_first()
            # Sub-categories live in the <div> whose class matches the big category.
            scate_list = response.xpath(
                "//div[@class='{}']/div".format(class_name))
            for scate in scate_list:
                item["SCate"] = scate.xpath("./div[1]/a/@title").extract_first()
                # Every tag (leaf link) under this sub-category.
                tag_list = scate.xpath("./div[2]/a")
                for tag in tag_list:
                    item["tag"] = tag.xpath("./text()").extract_first()
                    href = tag.xpath("./@href").extract_first()
                    if not href:
                        # Missing href would make "http:" + None raise TypeError.
                        continue
                    item["tag_link"] = "http:" + href
                    # Enter the tag's listing page; deepcopy so concurrent
                    # callbacks don't share one mutable dict.
                    yield scrapy.Request(
                        item["tag_link"],
                        callback=self.good_list,
                        meta={"item": deepcopy(item)},
                    )

    def good_list(self, response):
        item = deepcopy(response.meta["item"])
        # All product cards on the current listing page.
        li_list = response.xpath("//div[@id='product-wrap']/div/ul/li")
        for li in li_list:
            # Image src is scheme-relative; guard against a missing node so the
            # concatenation cannot raise TypeError.
            img = li.xpath(
                ".//div[@class='res-img']/div/a/img/@src").extract_first()
            item["good_img"] = "http:" + img if img else None
            item["good_name"] = li.xpath(
                ".//div[@class='res-info']/div/a/text()").extract_first()
            item["good_price"] = li.xpath(
                ".//div[@class='res-info']/div/span/text()").extract_first()
            item["good_href"] = li.xpath(
                ".//div[@class='res-info']/div/a/@href").extract_first()
            # Follow the detail page, skipping dead "javascript:void(0);" links
            # and missing hrefs (None would crash the concatenation below).
            if item["good_href"] and item["good_href"] != "javascript:void(0);":
                yield scrapy.Request(
                    "http:" + item["good_href"],
                    callback=self.good_detail,
                    meta={"item": deepcopy(item)},
                )
        # Pagination: urljoin handles relative and scheme-relative hrefs.
        next_url = response.xpath("//a[@id='nextPage']/@href").extract_first()
        if next_url:
            yield scrapy.Request(
                response.urljoin(next_url),
                callback=self.good_list,
                meta={"item": response.meta["item"]},
            )

    def good_detail(self, response):
        item = response.meta["item"]
        # Spec rows (colour, version, ...): <dt> holds the attribute name,
        # the <dd> list holds its possible values.
        size_list = response.xpath("//div[@id='J-TZM']/dl")
        for size in size_list:
            size_name = size.xpath("./dt/span/text()").extract_first()
            size_value = size.xpath("./dd/ul/li/@title").extract()
            item[size_name] = size_value
        # Yield (not just print) so item pipelines actually receive the item.
        yield item
# Adapted from: https://www.cnblogs.com/ForT/p/10849396.html