Conclusion first: take the code at the very end of this post, swap in your own keyword, and it will write an Excel file with each item's title, price, and link.
Now for a detailed walkthrough. The first step is constructing the URL. I've been interested in dash cams (行车记录仪) lately, so let's search with that keyword. After submitting it, the address bar shows:
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&wq=行车记录仪&pvid=1ca…
The trailing … part is per-user session data and differs for everyone. The next step is to figure out which parameters can be dropped. After trimming, it turns out that https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8 is enough on its own. Paging through the results and stripping the unneeded parameters each time, the recorded URLs are:

https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=3&s=56
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=5&s=112
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=7&s=167
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=9&s=223
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=11&s=278
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=13&s=333
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=15&s=388
https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8&page=17&s=443

The parameters that matter are page and s. For the n-th page (counting from 0), page follows 2n + 1, while s is an irregular number, so the URL list needs some special construction.
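Before building anything, it's worth a quick sanity check on that irregular s. A throwaway sketch over the values recorded above (the first page carries no explicit s, so treat it as 1) shows the successive differences hover around 55, which is exactly what the updated code at the end of this post exploits with s = 55 * i + 1:

# s values read off the URLs recorded above (first page taken as s=1)
s_values = [1, 56, 112, 167, 223, 278, 333, 388, 443]

deltas = [b - a for a, b in zip(s_values, s_values[1:])]
print(deltas)  # [55, 56, 55, 56, 55, 55, 55, 55]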
Here is the full script; replace the cookie with your own:

import requests
import csv
import time
import random
import traceback
from lxml import etree


def url_list_l1(start_url, depth, s_list):
    # page = 2n + 1; s comes from the hand-recorded list below
    u_list = [start_url + '&page={}&s={}'.format(2 * i + 1, j)
              for i, j in enumerate(s_list)]
    return u_list[:depth]


def get_text(url):
    hds = {
        'cookie': '...',  # replace with your own cookie
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'}
    try:
        r = requests.get(url, headers=hds, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except BaseException:
        traceback.print_exc()
        return ''


def get_page_information(url_list):
    with open('information.csv', 'a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Description', 'Price', 'URL'])
        for url in url_list:
            time.sleep(random.uniform(1.0, 2.0))  # polite, randomized delay
            html = get_text(url)
            if html == '':
                continue
            response = etree.HTML(html)
            # the <em> holds the description in two text nodes: brand, then title
            brands = response.xpath(
                '//*[@id="J_goodsList"]/ul/li/div/div[4]/a/em/text()[1]')
            titles = response.xpath(
                '//*[@id="J_goodsList"]/ul/li/div/div[4]/a/em/text()[2]')
            prices = response.xpath(
                '//*[@id="J_goodsList"]/ul/li/div/div[3]/strong/i')
            urls = response.xpath(
                '//*[@id="J_goodsList"]/ul/li/div/div[4]/a/@href')
            # some hrefs are protocol-relative; prepend the scheme
            for i in range(len(urls)):
                if urls[i][:5] != 'https':
                    urls[i] = 'https:' + urls[i]
            for brand, title, price, url in zip(brands, titles, prices, urls):
                # price is still an element here; serialize its text content
                price = etree.tostring(
                    price, encoding='utf-8', method='text').decode()
                writer.writerow([brand + title, price, url])


url_1 = 'https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8'
depth = 8
special_list = [1, 56, 112, 167, 223, 278, 333, 388, 443]
url_list_1 = url_list_l1(url_1, depth, special_list)
get_page_information(url_list_1)

A few things worth noting here:

(1) xpath syntax and construction: when you need text, look at how price is handled. You either parse the element with the etree.tostring(...).decode() method, or append '/text()' to the expression:
prices = response.xpath('//*[@id="J_goodsList"]/ul/li/div/div[3]/strong/i/text()')

This single expression replaces the whole decoding dance above. When text() is split into several segments, you can pick one by index, e.g. text()[1] or text()[2].
(2) When you need an attribute rather than text, see how the URLs are extracted: append /@href to the end of the xpath expression.
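To make notes (1) and (2) concrete, here is a minimal, self-contained sketch on a made-up HTML snippet (the markup is illustrative only, loosely imitating one result item, not JD's actual page structure):

from lxml import etree

html = ('<li><div><a href="//item.jd.com/123.html">'
        '<em>BrandX <font>行车记录仪</font> 1080P</em></a>'
        '<strong><i>299.00</i></strong></div></li>')
tree = etree.HTML(html)

# (1a) select the element, then serialize its text content
price_el = tree.xpath('//strong/i')[0]
print(etree.tostring(price_el, encoding='utf-8', method='text').decode())  # 299.00

# (1b) or append /text() to get plain strings directly; when the text is
# split by a child element (the <font> here), index the segments --
# note that XPath indices start at 1
print(tree.xpath('//em/text()[1]'))  # ['BrandX ']
print(tree.xpath('//em/text()[2]'))  # [' 1080P']

# (2) for an attribute, append /@name instead
print(tree.xpath('//a/@href'))  # ['//item.jd.com/123.html']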
(3) The standard pattern for writing a CSV file: first create a writer object:
content2 = csv.writer(file)

and then operate on that object, writing each row as a list:
content2.writerow(info_list)

Finally, if you open the saved UTF-8 CSV directly in Excel, the text comes out garbled; re-save it in ANSI encoding and it opens cleanly. Judging by the results, these things really aren't cheap, heh.
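If you'd rather skip the manual ANSI re-save, one common workaround (not what the code above does) is to write the file with a UTF-8 BOM, which Excel recognizes:

import csv

# 'utf-8-sig' writes a BOM at the start of the file, which lets Excel
# detect the encoding and open the CSV without mojibake
with open('information.csv', 'w', newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerow(['Description', 'Price', 'URL'])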
Code update: I changed the price xpath to the /text() form, tidied parts of the code to make it more concise, and, in for a penny, in for a pound, wrote the output directly to an .xls file:
import requests
import time
import random
import traceback
from lxml import etree
import xlwt


def url_list_l1(start_url, depth):
    # page = 2n + 1; s is approximated by 55n + 1, close to the recorded values
    return [start_url + '&page={}&s={}'.format(2 * i + 1, 55 * i + 1)
            for i in range(depth)]


def get_text(url):
    hds = {'cookie': '...'}  # replace with your own cookie
    try:
        r = requests.get(url, headers=hds, timeout=30)
        r.raise_for_status()
        r.encoding = 'utf-8'
        return r.text
    except BaseException:
        traceback.print_exc()
        return ''


def get_page_information(url_list):
    workbook = xlwt.Workbook()
    sheet1 = workbook.add_sheet('sales_information', cell_overwrite_ok=True)
    row_0 = ['Description', 'Price', 'URL']
    for i in range(len(row_0)):
        sheet1.write(0, i, row_0[i])
    count = 1  # next free row in the sheet
    for url in url_list:
        time.sleep(random.uniform(1.0, 2.0))  # polite, randomized delay
        html = get_text(url)
        if html == '':
            continue
        response = etree.HTML(html)
        brands = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[4]/a/em/text()[1]')
        titles = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[4]/a/em/text()[2]')
        prices = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[3]/strong/i/text()')
        urls = response.xpath(
            '//*[@id="J_goodsList"]/ul/li/div/div[4]/a/@href')
        # some hrefs are protocol-relative; prepend the scheme
        for i in range(len(urls)):
            if urls[i][:5] != 'https':
                urls[i] = 'https:' + urls[i]
        for brand, title, price, url in zip(brands, titles, prices, urls):
            in_list = [brand + title, price, url]
            for j in range(len(in_list)):
                sheet1.write(count, j, in_list[j])
            count += 1
    workbook.save('Sales_information.xls')


if __name__ == '__main__':
    url_1 = 'https://search.jd.com/Search?keyword=行车记录仪&enc=utf-8'
    depth = 8
    url_list_1 = url_list_l1(url_1, depth)
    get_page_information(url_list_1)
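To reuse this for another product, only the keyword in url_1 needs to change. If you want to build the start URL programmatically, a small hypothetical helper (build_start_url is my own name, not part of the script above) could percent-encode arbitrary keywords:

from urllib.parse import quote

def build_start_url(keyword):
    # percent-encode so non-ASCII search terms stay valid inside the URL
    return 'https://search.jd.com/Search?keyword={}&enc=utf-8'.format(quote(keyword))

print(build_start_url('行车记录仪'))
# https://search.jd.com/Search?keyword=%E8%A1%8C%E8%BD%A6%E8%AE%B0%E5%BD%95%E4%BB%AA&enc=utf-8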