# coding: utf-8
import json
import threading
from queue import Queue

import requests
from lxml import etree
class QiushiSpide(object):
    """Multi-threaded scraper for the qiushibaike.com "hot" pages.

    Work flows through three queues in a producer/consumer pipeline:

        url_queue --(parse_url workers)--> html_queue
        html_queue --(get_content_list workers)--> content_queue
        content_queue --(save_content worker)--> output file

    Each worker thread loops forever; ``run()`` starts them as daemon
    threads and blocks until every queue reports all tasks done.
    """

    def __init__(self):
        # Page URL template; {} is replaced with the page number.
        self.url_tmp = "https://www.qiushibaike.com/8hr/page/{}/"
        # Browser-like User-Agent so the site does not reject the request.
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36"}
        # Site root, used to turn relative hrefs into absolute links.
        self.pre_url = "https://www.qiushibaike.com"
        self.url_queue = Queue()      # page URLs waiting to be fetched
        self.html_queue = Queue()     # raw HTML waiting to be parsed
        self.content_queue = Queue()  # parsed item lists waiting to be saved

    def get_url_list(self):
        """Enqueue the URLs of pages 1..13 onto ``url_queue``."""
        for page in range(1, 14):
            self.url_queue.put(self.url_tmp.format(page))
        print(self.url_queue.qsize())

    def parse_url(self):
        """Worker: fetch pages from ``url_queue``, push decoded HTML to ``html_queue``."""
        while True:
            url = self.url_queue.get()
            print(url)
            # BUG FIX: the second positional argument of requests.get() is
            # `params`, not headers — the original never sent the User-Agent.
            # A timeout keeps a stalled request from hanging the worker forever.
            response = requests.get(url, headers=self.header, timeout=10)
            self.html_queue.put(response.content.decode())
            self.url_queue.task_done()
            print("url_queue 完成一个")

    def get_content_list(self):
        """Worker: parse HTML from ``html_queue`` into item dicts on ``content_queue``.

        Each item dict may contain: img_url, text, a_href, smile_num,
        comment_num — any field missing in the page is set to None.
        """
        while True:
            html_str = self.html_queue.get()
            html = etree.HTML(html_str)
            li_list = html.xpath("//li[contains(@class,'item typs_')]")
            content_list = []
            for li in li_list:
                item = {}
                for img in li.xpath(".//a[contains(@class,'recmd-left')]"):
                    # Thumbnails are protocol-relative ("//pic...") — prefix scheme.
                    src = img.xpath("./img/@src")
                    item["img_url"] = "https:" + src[0] if src else None
                for div in li.xpath(".//div[@class='recmd-right']"):
                    text = div.xpath("./a/text()")
                    item["text"] = text[0] if text else None
                    href = div.xpath("./a/@href")
                    item["a_href"] = self.pre_url + href[0] if href else None
                    # BUG FIX: guard on the same text() node-set we index; the
                    # original checked span existence but indexed span/text(),
                    # raising IndexError for a span with no text.
                    smile = div.xpath(".//div[@class='recmd-num']/span[1]/text()")
                    item["smile_num"] = smile[0] if smile else None
                    comment = div.xpath(".//div[@class='recmd-num']/span[4]/text()")
                    item["comment_num"] = comment[0] if comment else None
                content_list.append(item)
            self.content_queue.put(content_list)
            self.html_queue.task_done()
            print("html_queue 完成一个")

    def save_content(self):
        """Worker: append each parsed batch to the output file as pretty JSON."""
        while True:
            content = self.content_queue.get()
            with open("糗百多线程.txt", 'a', encoding='utf-8') as f:
                f.write(json.dumps(content, ensure_ascii=False, indent=2))
                f.write("\n")
            self.content_queue.task_done()

    def run(self):
        """Seed the URL queue, start daemon workers, and wait for the pipeline to drain."""
        t_list = []
        self.get_url_list()
        for _ in range(4):
            t_list.append(threading.Thread(target=self.parse_url))
        print("添加parse_url线程结束")
        for _ in range(4):
            t_list.append(threading.Thread(target=self.get_content_list))
        print("添加get_content_list线程结束")
        t_list.append(threading.Thread(target=self.save_content))
        for t in t_list:
            # Daemon threads are killed when the main thread exits, so the
            # infinite worker loops do not keep the process alive.
            # (setDaemon() is deprecated; assign the attribute instead.)
            t.daemon = True
            t.start()
        for q in [self.url_queue, self.html_queue, self.content_queue]:
            q.join()  # block until every task put on this queue is marked done
        print("主线程end")
if __name__ == "__main__":
    # Entry point: build the spider and run the full scraping pipeline.
    spider = QiushiSpide()
    spider.run()
# Reposted from: https://www.cnblogs.com/dreamhai/p/10575359.html
# Related resource: GPU compute-capability comparison table (相关资源: 各显卡算力对照表)