import re
import time

from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
class LagouSpider(object):
    """Crawl Python job postings from lagou.com through a Chrome WebDriver.

    The spider loads the job-list page, opens every posting linked from it
    in a separate browser window, extracts the posting's fields, prints
    them, and then advances through the pager until the "next" button is
    marked as disabled.
    """

    # Characters removed from scraped fields: "/" separators and any
    # whitespace.  For str patterns ``\s`` already covers the no-break
    # space (U+00A0), so no extra escape is needed for it.
    _NOISE = re.compile(r"[/\s]")

    def __init__(self):
        """Start a Chrome session and remember the job-list URL."""
        self.driver = webdriver.Chrome()
        self.url = "https://www.lagou.com/jobs/list_python?px=default&city=全国#filterBox"

    @classmethod
    def _strip_noise(cls, text):
        """Return *text* with every "/" and whitespace character removed."""
        return cls._NOISE.sub("", text)

    def run(self):
        """Walk every page of the job list, then shut the browser down.

        The browser is quit when the crawl ends, whether it finished
        normally or an exception propagated, so the driver cannot be
        reused after this call.
        """
        try:
            self.driver.get(self.url)
            while True:
                self.parse_page_list(self.driver.page_source)
                next_btn = self.driver.find_element(
                    By.XPATH, "//div[@class='pager_container']/span[last()]"
                )
                # get_attribute() returns None when the attribute is absent.
                css_classes = next_btn.get_attribute("class") or ""
                if "pager_next_disabled" in css_classes:
                    break
                # NOTE(review): nothing waits for the next page to render
                # before page_source is read again -- confirm whether an
                # explicit wait is needed here.
                next_btn.click()
        finally:
            self.driver.quit()

    def parse_page_list(self, source):
        """Visit every job-detail link found in the list page HTML.

        Args:
            source: HTML text of one job-list page.
        """
        html = etree.HTML(source)
        detail_urls = html.xpath("//div/a[@class='position_link']/@href")
        for detail_url in detail_urls:
            self.get_detail_page(detail_url)
            # Pause between postings to throttle requests.
            time.sleep(1)

    def get_detail_page(self, detail_url):
        """Open *detail_url* in a new window, parse it, and return to the list.

        Args:
            detail_url: URL of a single job posting.
        """
        list_window = self.driver.current_window_handle
        # Pass the URL as a script argument rather than splicing it into
        # the JavaScript source, so quotes in the URL cannot break out.
        self.driver.execute_script("window.open(arguments[0])", detail_url)
        self.driver.switch_to.window(self.driver.window_handles[-1])
        try:
            self.parse_datail_page(self.driver.page_source)
        finally:
            # Always close the detail window and go back to the job list,
            # even when parsing fails, so windows do not pile up.
            self.driver.close()
            self.driver.switch_to.window(list_window)

    def parse_datail_page(self, source):
        """Extract the fields of one job posting, print and return them.

        Args:
            source: HTML text of a job-detail page.

        Returns:
            dict with the keys ``name``, ``job_salary``, ``city``,
            ``work_year``, ``education``, ``company_name`` and ``desc``.

        Raises:
            IndexError: if an expected element is missing from the page.
        """
        html = etree.HTML(source)

        job_name = html.xpath("//div[@class='job-name']/h2/text()")[0].strip()

        # The spans are read in order: salary, city, experience, education.
        job_request_spans = html.xpath("//dd[@class='job_request']//span")
        job_salary = job_request_spans[0].xpath("./text()")[0].strip()
        city = self._strip_noise(job_request_spans[1].xpath("./text()")[0])
        work_year = self._strip_noise(job_request_spans[2].xpath("./text()")[0])
        education = self._strip_noise(job_request_spans[3].xpath("./text()")[0])

        company_name = html.xpath("//h3[@class='fl']//text()")[0].strip()

        desc = self._strip_noise(
            "".join(
                html.xpath("//dl[@id='job_detail']/dd[@class='job_bt']//text()")
            )
        )

        position = {
            "name": job_name,
            "job_salary": job_salary,
            "city": city,
            "work_year": work_year,
            "education": education,
            "company_name": company_name,
            "desc": desc,
        }
        print(position)
        return position
# Script entry: build the spider (this launches Chrome) and crawl the list.
lagou = LagouSpider()
lagou.run()
# Reposted from: https://www.cnblogs.com/ForT/p/11152098.html