爬虫学习(五)——百度贴吧的爬取

it2022-05-05  157

import os
import time
import urllib.request
import urllib.parse


def header():
    """Prompt for a Tieba forum name and a page range, then crawl each page.

    Reads the forum name and the inclusive start/end page numbers from
    stdin, then fetches and saves each page in turn.
    """
    url = "https://tieba.baidu.com/f?"
    baming = input("请输入要爬取的吧名")
    start_page = int(input("请输入起始页"))
    end_page = int(input("请输入结束页"))
    # Crawl every requested page, end page inclusive.
    for page in range(start_page, end_page + 1):
        print("正在爬取第%s页" % page)
        request = headle_request(page, url, baming)
        download(request, baming, page)
        # Pause between requests so the site does not flag the crawler
        # as an abusive client.
        time.sleep(2)


def headle_request(page, url, baming):
    """Build a urllib Request for one result page of the given forum.

    page: 1-based page number; Tieba paginates 50 posts per page via `pn`.
    url: base search URL ending in '?'.
    baming: forum ("bar") name, sent as the `kw` query parameter.
    Returns a urllib.request.Request with a browser User-Agent set.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"}
    pn = (page - 1) * 50
    data = {
        "kw": baming,
        "ie": "utf8",
        "pn": pn,
    }
    query = urllib.parse.urlencode(data)
    return urllib.request.Request(url + query, headers=headers)


def download(request, baming, page):
    """Fetch the page for `request` and save it as HTML under a folder
    named after the forum.

    request: prepared urllib Request for one result page.
    baming: forum name; used both as the folder name and filename prefix.
    page: page number; embedded in the saved filename.
    """
    # Use a context manager so the HTTP response is always closed
    # (the original leaked the connection).
    with urllib.request.urlopen(request) as response:
        content = response.read()
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # exists() + mkdir().
    os.makedirs(baming, exist_ok=True)
    filename = baming + "第%s页" % page + ".html"
    print(filename)
    filepath = os.path.join(baming, filename)
    # Write raw bytes: the page is stored exactly as served.
    with open(filepath, "wb") as tf:
        tf.write(content)


if __name__ == '__main__':
    header()

转载于:https://www.cnblogs.com/kuangkuangduangduang/p/10369636.html


最新回复(0)