'''After much toil I finally finished writing my first crawler, woo-hoo!
Scrapes jokes from the Qiushibaike text section, looping over multiple pages.
'''
import urllib.request
import ssl
import re
# Set the request headers
headers = {
    'Accept': 'text/html, application/xhtml+xml, */*',
    # 'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6756.400 QQBrowser/10.3.2545.400',
    'DNT': '1',
    'Connection': 'Keep-Alive',
    'Host': 'www.qiushibaike.com'
}
def jokeCrawker(url):
    # Create an unverified SSL context so the HTTPS request skips certificate checks
    context = ssl._create_unverified_context()
    req = urllib.request.Request(url=url, headers=headers)
    response = urllib.request.urlopen(req, context=context)
    # data = str(response.read())
    data = response.read().decode("utf-8")
    # '.' does not match '\n' unless the pattern is compiled with re.S; this tripped me up for ages, ugh
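    # Tiny demo of the difference (example strings are made up, not from the original post):
    #   re.findall('<p>(.*?)</p>', '<p>a\nb</p>')        -> []
    #   re.findall('<p>(.*?)</p>', '<p>a\nb</p>', re.S)  -> ['a\nb']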
    pat = '<div class="author clearfix">(.*?)<span class="stats-vote"><i class="number">'
    re_joke = re.compile(pat, re.S)
    jokeList = re_joke.findall(data)
    jokeDict = {}
    for div in jokeList:
        # name: the poster's name
        pat = r'<h2>(.*?)</h2>'
        re_n = re.compile(pat, re.S)
        name = re_n.findall(div)[0]
        # words: the joke text itself
        pat = '<div class="content">\n<span>(.*?)</span>'
        re_w = re.compile(pat, re.S)
        words = re_w.findall(div)[0]
        # Strip repeated newlines, <br/> tags, and any character that is not a digit,
        # a CJK character, or common Chinese punctuation
        pat = '\\n{2,}|<br/>|[^\\d\u4e00-\u9fa5(\u3002|\uff1f|\uff01|\uff0c|\u3001|' \
              '\uff1b|\uff1a|\u201c|\u201d|\u2018|\u2019|\uff08|\uff09|\u300a|' \
              '\u300b|\u3008|\u3009|\u3010|\u3011|\u300e|\u300f|\u300c|\u300d|' \
              '\ufe43|\ufe44|\u3014|\u3015|\u2026|\u2014|\uff5e|\ufe4f|\uffe5)*]'
        word = re.sub(pat, '', words)
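        # e.g. (made-up input): re.sub(pat, '', '嘻嘻<br/>哈哈123abc!')  ->  '嘻嘻哈哈123'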
        jokeDict[name] = word
    with open("F:/糗事段子/file/qiushi.txt", "a+", encoding="utf-8") as fp:
        # Could also collect everything into one dict first to deduplicate and then
        # write it in one go with 'w', but that would use too much memory.
        # Could also index entries later to decide whether to overwrite (not needed short term).
        for k, v in jokeDict.items():
            info = k + "说:" + v + "\n"
            fp.write(info)
    return True
for i in range(1, 11):  # crawl pages 1 through 10
    weburl2 = "https://www.qiushibaike.com/text/page/" + str(i) + "/"
    jokeCrawker(weburl2)
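As a quick sanity check of the extraction logic, here is a minimal standalone sketch that runs the same two inner regular expressions against a small hand-written HTML fragment instead of a live page; the fragment and the sample_div name are made up for illustration and are not part of the crawler.

# Standalone sketch: sanity-check the name/content patterns on a made-up fragment
import re

sample_div = (
    '<h2>\nSomeUser\n</h2>\n'
    '<div class="content">\n<span>line one<br/>line two</span>\n</div>'
)

name = re.findall(r'<h2>(.*?)</h2>', sample_div, re.S)[0].strip()
words = re.findall('<div class="content">\n<span>(.*?)</span>', sample_div, re.S)[0]
print(name + "说:" + words.replace('<br/>', ' '))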