爬虫学习(十)——原始正则抓取数据案例

it2022-05-05  154

糗事百科图片抓取案例

import os
import re
import time
import urllib.request
import urllib.parse


def header():
    """Entry point: read a page range and a folder name, then crawl each page.

    Prompts the user for the start page, end page, and the directory name
    used to store downloaded images, then downloads every listing page in
    the inclusive range.
    """
    start_page = int(input("请输入起始页"))
    end_page = int(input("请输入结束页"))
    qiutu = input("请输入文件名字")
    # Crawl every requested page.
    for page in range(start_page, end_page + 1):
        print("正在爬取第%s页" % page)
        request = headle_request(page)
        # BUG FIX: the original called download(request, page, qiutu) with
        # three arguments, but download() only accepts (request, qiutu),
        # so the script crashed with a TypeError on the very first page.
        download(request, qiutu)
        # Throttle between pages so the site does not flag us as abusive.
        time.sleep(2)


def headle_request(page):
    """Build the Request object for listing page *page*.

    Sends a desktop-browser User-Agent so the site serves the normal
    HTML page. Returns a urllib.request.Request ready for urlopen().
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36"
    }
    url = "https://www.qiushibaike.com/pic/page/%s/?s=5167052" % page
    request = urllib.request.Request(url, headers=headers)
    return request


def download(request, qiutu):
    """Fetch one listing page and save every thumbnail image into *qiutu*.

    Creates the target directory on first use, extracts each image URL and
    its alt text with a regex, and stores the image as "<alt>.jpg".
    """
    response = urllib.request.urlopen(request)
    # Create the storage folder lazily, only once.
    if not os.path.exists(qiutu):
        os.mkdir(qiutu)
    content = response.read().decode("utf8")
    # Regex capturing (image URL, alt text) from each thumbnail block.
    # re.S lets '.' span newlines inside the <div>.
    img = re.compile(
        r'<div class="thumb">.*?<img src="(.*?)" alt="(.*?)" />.*?</div>', re.S
    )
    ret = img.findall(content)
    for x in ret:
        # The page uses protocol-relative URLs ("//..."), so prepend the scheme.
        img_url = "http:" + x[0]
        # Use the alt text as the file name.
        filename = x[1] + ".jpg"
        image_path = os.path.join(qiutu, filename)
        urllib.request.urlretrieve(img_url, image_path)
        # Throttle between images as well.
        time.sleep(1.5)


if __name__ == '__main__':
    header()

励志网语录抓取案例

import os
import re
import time
import urllib.request
import urllib.parse


def main():
    """Entry point: read a page range and crawl each quote listing page."""
    start_page = int(input("请输入抓取的起始页:"))
    end_page = int(input("请输入抓取的结束页:"))
    for page in range(start_page, end_page + 1):
        # BUG FIX: the original message was "正在爬取第%d" — the trailing
        # 页 ("page") character was missing from the progress output.
        print("正在爬取第%d页" % page)
        ret = request(page)
        content(ret)


def request(page):
    """Fetch listing page *page* and return [(article_href, title), ...].

    Each tuple holds the article's relative URL and its title, extracted
    with a regex from the listing HTML.
    """
    # BUG FIX: the original header key was "User - Agent" (with spaces),
    # so no valid User-Agent header was ever sent; the value was also
    # mangled with stray spaces. Use a proper browser UA string.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"
    }
    # BUG FIX: the original URL string started with a space
    # (" http://...") which makes urllib reject it with
    # "unknown url type" before any request is made.
    url = "http://www.yikexun.cn/lizhi/qianming/list_50_%s.html" % page
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request).read().decode("utf8")
    # Capture (href, title) from each article teaser block; re.S lets '.'
    # match across newlines inside the <div>.
    pattern = re.compile(
        r'<div class="art-t">.*?<a href="(.*?)"><b>(.*?)</b></a>.*?</div>', re.S
    )
    ret = pattern.findall(response)
    return ret


def content(ret):
    """Download each article in *ret* and save it as an HTML file.

    Creates the output folder on first use, fetches every article page,
    extracts its body with a regex, and writes "<title>.html" containing
    a colored <h1> title followed by the article body.
    """
    if not os.path.exists("励志语录1"):
        os.mkdir("励志语录1")
    for item in ret:
        title = item[1] + ".html"
        # Listing hrefs are site-relative; prepend the host.
        article_url = "http://www.yikexun.cn" + item[0]
        article_path = os.path.join("励志语录1", title)
        response = urllib.request.urlopen(article_url)
        string = response.read().decode("utf8")
        # Group 0 of each match is the whole <div class="neirong"> block.
        regular = re.compile(
            r'(<div class="neirong">.*?<p>(.*?)</p>.*?</div>)', re.S
        )
        neirong = regular.findall(string)
        for info in neirong:
            cont = '<h1 style="color:blue">%s</h1>\n%s' % (item[1], info[0])
            # NOTE: "w" mode means a later match overwrites an earlier one,
            # matching the original behavior. The redundant tf.close()
            # inside the with-block was removed (the context manager
            # already closes the file).
            with open(article_path, "w", encoding="utf8") as tf:
                tf.write(cont)
        # Throttle between articles.
        time.sleep(1)


if __name__ == '__main__':
    main()

转载于:https://www.cnblogs.com/kuangkuangduangduang/p/10374888.html


最新回复(0)