'''爬取猫眼电影TOP100,并将其保存'''
import csv
import os
import random
import re
import time
from urllib import request
class MaoyanSpider:
    """Scrape the Maoyan TOP100 movie board and store the films in a CSV file.

    Each board page is fetched with a randomly chosen User-Agent, parsed with
    a regular expression, and the extracted (title, stars, release time) rows
    are appended to ``csv_path``.

    Args:
        csv_path: Destination CSV file. Defaults to ``'./maoyan.csv'``.
    """

    # One match per film entry: captures title, starring line, release line.
    # re.S lets ``.`` span the newlines between the HTML tags.
    _FILM_PATTERN = re.compile(
        r'<div class="movie-item-info">.*?title="(.*?)".*?class="star">(.*?)</p>.*?class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self, csv_path='./maoyan.csv'):
        # URL template; ``{}`` receives the paging offset.
        self.url = 'https://maoyan.com/board/4?offset={}'
        self.csv_path = csv_path
        # Pool of User-Agent strings; one is picked at random per request.
        self.ua_list = [
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.835.163 Safari/535.1',
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0',
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50',
            'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; GTB7.0)',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; ) AppleWebKit/534.12 (KHTML, like Gecko) Maxthon/3.0 Safari/534.12',
            'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.33 Safari/534.3 SE 2.X MetaSr 1.0',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/13.0.782.41 Safari/535.1 QQBrowser/6.9.11079.201',
        ]

    def get_page(self, url):
        """Download ``url``, decode it as UTF-8 and pass the HTML to ``parse_page``.

        Args:
            url: Address of the page to download.
        """
        # Use a random User-Agent for every request.
        headers = {'User-Agent': random.choice(self.ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if read/decode fails.
        with request.urlopen(req) as res:
            html = res.read().decode('utf-8')
        self.parse_page(html)

    def parse_page(self, html):
        """Extract the film tuples from ``html`` and pass them to ``save_info``.

        Args:
            html: Page source as text.
        """
        self.save_info(self._FILM_PATTERN.findall(html))

    def save_info(self, r_list):
        """Append the films in ``r_list`` to the CSV file.

        Args:
            r_list: Iterable of records whose first three items are the
                title, stars and release-time strings; each is stripped of
                surrounding whitespace before being written.
        """
        film_list = [
            (rt[0].strip(), rt[1].strip(), rt[2].strip()) for rt in r_list
        ]
        with open(self.csv_path, 'a', encoding='utf-8', newline='') as f:
            # Write all rows in one call to reduce I/O.
            csv.writer(f).writerows(film_list)

    def main(self):
        """Recreate the CSV file with a header row, then fetch all ten pages."""
        # Mode 'w' truncates any previous output, so no exists/remove check is needed.
        with open(self.csv_path, 'w', encoding='utf-8', newline='') as f:
            csv.writer(f).writerow(['电影名称', '主演', '上映时间'])
        # Offsets 0, 10, ..., 90 address the ten board pages.
        for page, offset in enumerate(range(0, 91, 10), start=1):
            self.get_page(self.url.format(offset))
            print('第{}页成功下载'.format(page))
            # NOTE: no delay is applied between requests.
if __name__ == '__main__':
    # Run the full crawl and report how long it took.
    began_at = time.time()
    spider = MaoyanSpider()
    spider.main()
    finished_at = time.time()
    print('程序执行时间为: %.2f' % (finished_at - began_at))
# Reposted from: https://www.cnblogs.com/yuxiangyang/p/11212544.html
# Related resource: Python 3 code for scraping the Maoyan movie ranking