(1) Environment Setup
The requests and pymongo libraries, plus fake_useragent for random User-Agent headers (pip install requests pymongo fake-useragent).
(2) Page Analysis
First, log in to Lagou and search for the keyword: 爬虫工程师 (crawler engineer).
The job listings are not in the page source, so the data must be loaded by JavaScript.
Next, open the Chrome developer tools, switch to the Network tab, and filter by XHR.
We find that the information we need is contained in the result field of the positionAjax.json response.
Observing the request, we see that it is a POST request.
Finally, we need to simulate this request to fetch the data.
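Based on the DevTools inspection, the response JSON nests the job list under content -> positionResult -> result. Below is a minimal sketch of that structure and how to drill into it; the nesting path comes from the analysis above, while the sample job fields (positionName, city, salary) are invented for illustration and are not guaranteed keys:

# Rough shape of the positionAjax.json response; sample field values are invented.
sample = {
    'content': {
        'positionResult': {
            'result': [  # the list of job postings we want to store
                {'positionName': '爬虫工程师', 'city': '上海', 'salary': '15k-30k'},
            ]
        }
    }
}

jobs = sample['content']['positionResult']['result']  # the list we later insert into MongoDB
for job in jobs:
    print(job['positionName'], job['city'], job['salary'])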
(3) Code Implementation
#author: "xian"
#date: 2018/5/29
import time
from pymongo import MongoClient
import requests
from fake_useragent import UserAgent  # library that supplies random User-Agent strings


client = MongoClient()
db = client.lagou  # connect to the lagou database (created automatically if it does not exist)
my_set = db.job2  # use the job2 collection (created automatically if it does not exist)

ua = UserAgent()  # initialize the User-Agent generator

url = 'https://www.lagou.com/jobs/positionAjax.json?city=上海&needAddtionalResult=false'

headers = {
    'Cookie':
'WEBTJ-ID=20180529193719-163abb02b0d71e-0a6a4200015d64-39614807-921600-163abb02b0f975; _ga=GA1.2.1134699470.1527593839; _gid=GA1.2.774560940.1527593839; Hm_lvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1527593839; user_trace_token=20180529193719-a4f52e51-6334-11e8-b37e-525400f775ce; LGSID=20180529193719-a4f53070-6334-11e8-b37e-525400f775ce; PRE_UTM=m_cf_cpt_baidu_pc; PRE_HOST=www.baidu.com; PRE_SITE=https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=0&rsv_idx=1&tn=baidu&wd=%E6%8B%89%E5%8B%BE%E7%BD%91&rsv_pq=85d8099500018bc0&rsv_t=08dcbrFGEAkrdozpbP9VW5IPYZ%2FKdnnU6DckaBTAZPPDI9zjjeFH4jkR3Lk&rqlang=cn&rsv_enter=1&rsv_sug3=11&rsv_sug1=13&rsv_sug7=100; PRE_LAND=https://www.lagou.com/lp/html/common.html?utm_source=m_cf_cpt_baidu_pc; LGUID=20180529193719-a4f5323c-6334-11e8-b37e-525400f775ce; X_HTTP_TOKEN=d9c4fe4fb6df7a49db968f666c1e2bcd; _putrc=17CC21EA917C179E123F89F2B170EADC; JSESSIONID=ABAAABAAAGFABEFCDA862BCEB8F11FB3B97530AAA468C39; login=true; unick=陆智贤; showExpriedIndex=1; showExpriedCompanyHome=1; showExpriedMyPublish=1; hasDeliver=0; gate_login_token=a5d16c416fb70f73c1c1190d6c2443500230225d67aef0a0a2e0df73094566cd; index_location_city=上海; SEARCH_ID=995bf702a0534ef496cbb491ff7875be; _gat=1; LGRID=20180529194050-22d3aa6d-6335-11e8-b380-525400f775ce; Hm_lpvt_4233e74dff0ae5bd0a3d81c6ccf756e6=1527594051; TG-TRACK-CODE=search_code',
    'User-Agent': ua.random,
    'Referer': 'https://www.lagou.com/jobs/list_爬虫工程师?labelWords=&fromSearch=true&suginput=',  # Lagou rejects requests without the right Referer
}

def get_job_info(page, keyword):
    for i in range(page):
        payload = {
            'first': 'true',
            'pn': i + 1,  # Lagou page numbers start at 1
            'kd': keyword,
        }
        response = requests.post(url, data=payload, headers=headers)  # the Ajax endpoint is a POST request
        if response.status_code == 200:
            time.sleep(2)
            print("Scraping page %s!" % (i + 1))
            # store the job list found under content -> positionResult -> result
            my_set.insert_many(response.json()['content']['positionResult']['result'])
        else:
            print('Scraping failed!')

if __name__ == '__main__':
    get_job_info(5, '爬虫工程师')
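After the crawl runs, the documents can be read back out of MongoDB to verify the inserts. A minimal sketch, reusing the lagou database and job2 collection from the script above (positionName and city are example field names from the response, not guaranteed keys):

from pymongo import MongoClient

client = MongoClient()
my_set = client.lagou.job2  # the same collection the crawler writes to

print(my_set.count_documents({}))   # how many job postings were stored
for job in my_set.find().limit(3):  # peek at the first few documents
    print(job.get('positionName'), job.get('city'))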
(4) Results
Reposted from: https://www.cnblogs.com/518894-lu/p/9107424.html