豆瓣影评爬取（以深海浩劫为例）

it 2022-05-05 98

代码示例：

spider：

import urllib.parse

import scrapy
from faker import Factory

from dou_movie.items import DouMovieItem

# Faker instance used to generate a User-Agent string for the request headers.
f = Factory.create()


class DouSpider(scrapy.Spider):
    """Log in to Douban and scrape the reviews of one movie.

    Request flow:
        start_requests -> parse_login -> after_login -> parse_review_page
        parse_review_page -> parse_comment_url -> parse_comment  (one per review)
        parse_review_page -> parse_next_page -> parse_review_page (pagination)

    Every request carries the same ``cookiejar`` meta key so the session
    cookies obtained at login are reused for the whole crawl.
    """

    name = 'movie'
    start_urls = ['https://www.douban.com/']

    # First page of the review listing to crawl; override to crawl another movie.
    reviews_url = 'https://movie.douban.com/subject/22266320/reviews'

    # Treated as read-only: callbacks must not mutate these class-level dicts,
    # because they are shared by every request (and every spider instance).
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'User-Agent': f.user_agent(),
    }
    # Login form template; the e-mail and password values are placeholders
    # that must be replaced with real credentials.
    formdata = {
        'form_email': '邮箱',
        'form_password': '你的密码',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None',
    }

    def start_requests(self):
        """Open the login page, starting cookie jar number 1."""
        yield scrapy.Request(
            url='https://accounts.douban.com/login',
            headers=self.headers,
            meta={'cookiejar': 1},
            callback=self.parse_login,
        )

    def parse_login(self, response):
        """Submit the login form, asking the user to solve a captcha if shown.

        Returns a one-element list holding the login ``FormRequest``.
        """
        self.logger.debug('login page status: %s', response.status)

        # Work on a copy so captcha fields never leak into the class template.
        formdata = dict(self.formdata)

        if b'captcha_image' in response.body:
            link = response.xpath(
                '//img[@class="captcha_image"]/@src').extract_first()
            if link:
                # The user has to open this URL to read the captcha.
                print(link)
                # NOTE: input() blocks the crawl until the user answers.
                formdata['captcha-solution'] = input('captcha-solution:')
                query = urllib.parse.urlparse(link).query
                captcha_ids = urllib.parse.parse_qs(
                    query, keep_blank_values=True).get('id')
                if captcha_ids:
                    formdata['captcha-id'] = captcha_ids[0]
            else:
                self.logger.warning(
                    'captcha marker found but no captcha image on %s',
                    response.url)

        return [scrapy.FormRequest.from_response(
            response,
            formdata=formdata,
            headers=self.headers,
            meta={'cookiejar': response.meta['cookiejar']},
            callback=self.after_login,
        )]

    def after_login(self, response):
        """Request the first page of the review listing after logging in."""
        self.logger.debug('post-login status: %s', response.status)
        # No explicit Host header is set here: the target lives on
        # movie.douban.com, so forcing 'www.douban.com' would send a wrong
        # Host. Leaving it out lets the HTTP client derive it from the URL.
        yield scrapy.Request(
            url=self.reviews_url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_review_page,
        )

    def parse_review_page(self, response):
        """Handle one listing page: follow its reviews and its next-page link.

        Both jobs run on the same response, so each listing page is
        downloaded only once.
        """
        yield from self.parse_comment_url(response)
        yield from self.parse_next_page(response)

    def parse_next_page(self, response):
        """Yield a request for the next listing page, if the page links one."""
        self.logger.debug('listing page status: %s', response.status)
        href = response.xpath(
            '//span[@class="next"]/a/@href').extract_first()
        if href is None:
            # Last page (or unexpected markup): nothing more to follow.
            self.logger.info('no next page link on %s', response.url)
            return
        next_url = response.urljoin(href)
        self.logger.debug('next page: %s', next_url)
        yield scrapy.Request(
            url=next_url,
            meta={'cookiejar': response.meta['cookiejar']},
            headers=self.headers,
            callback=self.parse_review_page,
        )

    def parse_comment_url(self, response):
        """Yield one request per review linked from a listing page."""
        self.logger.debug('listing page status: %s', response.status)
        review_bodies = response.xpath(
            '//div[@class="review-list "]/div'
            '/div[@class="main review-item"]/div[@class="main-bd"]')
        for item in review_bodies:
            comment_url = item.xpath('./h2/a/@href').extract_first()
            comment_title = item.xpath('./h2/a/text()').extract_first()
            if not comment_url:
                # Entry without a link: skip it instead of aborting the page.
                self.logger.warning(
                    'review entry without link on %s', response.url)
                continue
            self.logger.debug('review: %s -> %s', comment_title, comment_url)
            yield scrapy.Request(
                url=comment_url,
                meta={'cookiejar': response.meta['cookiejar']},
                headers=self.headers,
                callback=self.parse_comment,
            )

    def parse_comment(self, response):
        """Yield one ``DouMovieItem`` built from a single review page.

        Each field holds the list returned by ``extract()`` and may be empty.
        Nothing is yielded when the page has no ``div#content``.
        """
        self.logger.debug('review page status: %s', response.status)
        if not response.xpath('//div[@id="content"]'):
            return

        # All expressions are absolute, so they are evaluated on the response.
        useful_panel = '//div[@class="main-panel-useful"]'
        header = '//header[@class="main-hd"]'

        comment = DouMovieItem()
        comment['useful'] = response.xpath(
            useful_panel + '/button[1]/text()').extract()
        comment['no_help_num'] = response.xpath(
            useful_panel + '/button[2]/text()').extract()
        comment['people'] = response.xpath(
            header + '/a[1]/span/text()').extract()
        comment['people_url'] = response.xpath(
            header + '/a[1]/@href').extract()
        comment['star'] = response.xpath(
            header + '/span[1]/text()').extract()
        yield comment

转载于:https://www.cnblogs.com/zongdidi/p/10069814.html

专利

最新回复(0)