#!/usr/bin/python
#coding:utf-8
"""Scrapy spider that prints the articles linked from zhihudaily.ahorn.me."""
from __future__ import print_function

import scrapy


class ZhihudailySpider(scrapy.Spider):
    """Walk the index pages and print each linked article's title and text.

    Index pages are handled by ``parse``; every article link found there is
    fetched and handed to ``parse_url``, which prints the headline followed
    by the text of the content paragraphs.
    """

    name = 'zhihudaily'
    # Both hosts are listed: the index pages live on zhihudaily.ahorn.me,
    # so listing only zhihu.com would cause the pagination requests to be
    # dropped as off-site.
    allowed_domains = ['zhihu.com', 'zhihudaily.ahorn.me']
    start_urls = [
        "http://zhihudaily.ahorn.me/page/1"]

    def parse(self, response):
        """Yield a request per article link, then requests for index pages.

        :param response: response for one index page.
        :returns: generator of ``scrapy.Request`` objects; article requests
            call back into ``parse_url``, index-page requests into ``parse``.
        """
        for post in response.xpath("//div[@class='post']"):
            for entry in post.xpath("./div/div"):
                links = entry.xpath("./a/@href").extract()
                if not links:
                    # Entry without an <a href>; nothing to follow.
                    continue
                yield scrapy.Request(links[0], callback=self.parse_url)

        # Every index page re-emits the full page range (2..499). This only
        # terminates if the scheduler drops already-seen requests, which is
        # Scrapy's default (dont_filter is not set here).
        for page in range(2, 500):
            page_url = "http://zhihudaily.ahorn.me/page/" + str(page)
            yield scrapy.Request(page_url, callback=self.parse)

    def parse_url(self, response):
        """Print the headline and paragraph text of one article page.

        Pages without a ``headline-title`` <h1> are skipped (with a log
        message) instead of raising ``IndexError``.

        :param response: response for one article page.
        """
        titles = response.xpath(
            "//h1[@class='headline-title']/text()").extract()
        if not titles:
            self.log("no headline found, skipping %s" % response.url)
            return

        print("标题:", titles[0])
        print("*************************************************************************")
        for paragraph in response.xpath(
                "//div[@class='content']/p/text()").extract():
            print(paragraph)
# Reposted from: https://www.cnblogs.com/tmyyss/p/4551974.html
# Related resource (listed on the source blog page): "Data Structures - Transcript Generator"