Scraping Zhaopin Job Listings with Python, Selenium, and Chromedriver


Steps:

1. On the Zhaopin site, choose the job keyword and the work location.
2. Run the code.
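For reference, step 1 only determines the query parameters of the search URL that the code below opens. A minimal sketch of building that URL programmatically (parameter meanings are inferred from the URL used in the post: jl looks like a city code, kw is the keyword):

from urllib.parse import urlencode

# jl: city code chosen on the site (768 in this post), kw: job keyword
params = {'jl': 768, 'sf': 0, 'st': 0, 'kw': '研发工程师', 'kt': 3}
url = "https://sou.zhaopin.com/?" + urlencode(params)
# urlencode percent-encodes the Chinese keyword, which the site accepts
print(url)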

Things to note:

1. When the driver loads the first page, a popup window appears (shown in a screenshot in the original post). The code sleeps for 2 seconds so you can dismiss the window by hand.
2. During this crawl, the next-page button could not be clicked with a plain buttonTag.click(), so the click is done through self.driver.execute_script("arguments[0].click()", nextBtn) instead, as sketched below.
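As a standalone illustration of that JavaScript-click workaround (a minimal sketch; the try/except wrapper is my addition, and it assumes chromedriver is on PATH, not part of the original code):

from selenium import webdriver
from selenium.common.exceptions import ElementClickInterceptedException

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get("https://sou.zhaopin.com/")
nextBtn = driver.find_element_by_xpath("//div[@class='soupager']/button[2]")
try:
    nextBtn.click()  # plain Selenium click; fails if an overlay intercepts it
except ElementClickInterceptedException:
    # A JavaScript click bypasses Selenium's overlay/visibility checks.
    driver.execute_script("arguments[0].click()", nextBtn)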

The complete code is as follows:

# encoding: utf-8
from lxml import etree
from selenium import webdriver
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv


class ZhiLian(object):
    # Path to the local chromedriver binary (Selenium 3-style API).
    driver_path = r"C:\...\chromedriver.exe"

    def __init__(self):
        self.driver = webdriver.Chrome(executable_path=ZhiLian.driver_path)
        # Search results page: jl=768 is the city code, kw is the job keyword.
        self.url = "https://sou.zhaopin.com/?jl=768&sf=0&st=0&kw=研发工程师&kt=3"
        # newline='' prevents blank lines in the CSV on Windows.
        self.fp = open('zhilian_yanfa.csv', 'a', encoding='utf-8', newline='')
        self.writer = csv.DictWriter(self.fp, ['title', 'salary', 'address', 'work_years',
                                               'education', 'desc', 'detail_address', 'link'])
        self.positions = []  # buffer of parsed positions, flushed every 10 rows

    def run(self):
        self.driver.get(self.url)
        time.sleep(2)  # leave time to dismiss the popup window by hand
        while True:
            # Wait until the pager buttons are present before parsing the page.
            WebDriverWait(self.driver, timeout=10).until(
                EC.presence_of_element_located((By.XPATH, "//button[contains(@class,'soupager__btn')]"))
            )
            source = self.driver.page_source
            self.parse_list_page(source)
            time.sleep(2)
            nextBtn = self.driver.find_element_by_xpath("//div[@class='soupager']/button[2]")
            # The next-page button is disabled on the last page.
            if "disable" in nextBtn.get_attribute('class'):
                break
            # nextBtn.click() does not work here, so click via JavaScript instead.
            self.driver.execute_script("arguments[0].click()", nextBtn)

    def parse_list_page(self, source):
        # Collect the detail-page link of every job item on the list page.
        html = etree.HTML(source)
        links = html.xpath("//div[@class='contentpile__content__wrapper__item clearfix']//@href")
        for link in links:
            if "jobs" in link:
                self.parse_detail_page(link)
                time.sleep(1)

    def parse_detail_page(self, url):
        # Open the detail page in a new tab and switch to it.
        self.driver.execute_script("window.open('%s')" % url)
        self.driver.switch_to.window(self.driver.window_handles[1])
        WebDriverWait(self.driver, timeout=10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "describtion__detail-content"))
        )
        source = self.driver.page_source
        html = etree.HTML(source)
        title = html.xpath("//h3[@class='summary-plane__title']/text()")[0]
        salary = html.xpath("//span[@class='summary-plane__salary']/text()")[0]
        address = "".join(html.xpath("//ul[@class='summary-plane__info']/li[1]//text()"))
        work_years = html.xpath("//ul[@class='summary-plane__info']/li[2]//text()")[0]
        education = html.xpath("//ul[@class='summary-plane__info']/li[3]//text()")[0]
        desc = "".join(html.xpath("//div[@class='describtion']//text()"))
        detail_address = html.xpath("//span[@class='job-address__content-text']//text()")[0]
        position = {
            'title': title,
            'salary': salary,
            'address': address,
            'work_years': work_years,
            'education': education,
            'desc': desc,
            'detail_address': detail_address,
            'link': url
        }
        # Close the detail tab and switch back to the list page.
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
        self.write_position(position)

    def write_position(self, position):
        # Write to CSV in batches of 10 rather than one disk write per position.
        if len(self.positions) >= 10:
            self.writer.writerows(self.positions)
            self.positions.clear()
        self.positions.append(position)
        print(position['title'])


if __name__ == '__main__':
    spider = ZhiLian()
    spider.run()
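One caveat in write_position above: up to 9 buffered rows can still sit in self.positions when the crawl ends, and the file is never closed. A minimal sketch of a close() method that could be added to the class (the method and the try/finally wrapper are my additions, not part of the original post):

    def close(self):
        # Flush whatever is still buffered, then close the CSV file.
        if self.positions:
            self.writer.writerows(self.positions)
            self.positions.clear()
        self.fp.close()

if __name__ == '__main__':
    spider = ZhiLian()
    try:
        spider.run()
    finally:
        spider.close()  # runs even if the crawl stops with an exception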
