day

it2022-05-05  113

# Part 1: scrape the Douban Top 250 movie list with the requests library.
#   Request URL:     https://movie.douban.com/top250
#   Request method:  GET
#   Request headers: User-Agent, cookies
import re

import requests

# The notes above list User-Agent as a request header, but the original call
# never sent one; send a browser-like value with every request.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
        '(KHTML, like Gecko) Chrome/120.0 Safari/537.36'
    ),
}

# Marker that opens each movie entry in the list page.
ITEM_MARKER = '<div class="item">'

# Applied to ONE movie entry at a time. The "主演:" part and the trailing
# "inq" (one-line quote) span are optional so that an entry lacking them
# still yields a 9-field record instead of swallowing the following entry.
MOVIE_PATTERN = re.compile(
    r'<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?导演:(.*?)(?:主演:(.*?))?<br>'
    r'(.*?)</p>'
    r'.*?<span class="rating_num".*?>(.*?)</span>'
    r'.*?<span>(.*?)</span>'
    r'(?:.*?<span class="inq">(.*?)</span>)?',
    re.S,
)


def get_page(url):
    """Send a GET request to *url* and return the ``requests`` response.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-2xx status code.
    """
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    return response


def parse_index(html):
    """Extract the movie records from one list page.

    Returns a list of 9-tuples of strings:
    (rank, url, title, director, actors, year/genre, score, vote count, quote).
    Optional fields that are absent from an entry come back as ''.
    """
    movie_list = []
    # Everything before the first marker is page chrome, hence the [1:].
    for item_html in html.split(ITEM_MARKER)[1:]:
        match = MOVIE_PATTERN.search(item_html)
        if match is None:
            continue
        movie_list.append(tuple(field or '' for field in match.groups()))
    return movie_list


def save_data(movie):
    """Print one movie record and append it to ``douban_top250.txt``."""
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # The captured text is surrounded by newlines AND indentation spaces.
    year_type = year_type.strip()

    data = f'''
===================
电影排名:{top}
电影url:{m_url}
电影名称:{name}
电影导演:{daoyan}
电影主演:{actor}
年份类型:{year_type}
电影评分:{point}
电影评论:{commit}
电影简介:{desc}
==================
\n
\n
'''
    print(data)

    with open('douban_top250.txt', 'a', encoding='utf-8') as f:
        f.write(data)
    print(f'电影:{name}写入成功...')


if __name__ == '__main__':
    # 10 pages of 25 movies each: start=0, 25, ..., 225.
    for start in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={start}&filter='
        print(url)

        try:
            index_res = get_page(url)
        except requests.RequestException as exc:
            # Keep going so one bad page does not abort the whole crawl.
            print(f'request failed for {url}: {exc}')
            continue

        for movie in parse_index(index_res.text):
            save_data(movie)

二、selenium请求库  

       1.京东搜索

# Selenium demo 1: open jd.com, type a keyword into the search box, press Enter.
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

# NOTE(review): passing the chromedriver path positionally is the older
# Selenium calling style; newer releases expect a Service object instead.
# Confirm against the installed Selenium version.
driver = webdriver.Chrome(r'C:\Users\Administrator\Desktop\chromedriver.exe')

try:
    driver.get('https://www.jd.com/')

    # Explicit wait: poll up to 10 seconds for the search input (id="key").
    wait = WebDriverWait(driver, 10)
    input_tag = wait.until(EC.presence_of_element_located((By.ID, 'key')))

    time.sleep(5)
    input_tag.send_keys('公仔')
    input_tag.send_keys(Keys.ENTER)

    # Keep the result page open long enough to look at it.
    time.sleep(20)
finally:
    # quit() ends the whole WebDriver session and the driver process;
    # close() would only close the current window.
    driver.quit()

 2.百度登录

# Selenium demo 2: log in to Baidu, showing one locator strategy per step.
import time

from selenium import webdriver  # browser driver
from selenium.webdriver.common.by import By  # locator strategies
from selenium.webdriver.common.keys import Keys  # keyboard key constants

# NOTE(review): passing the chromedriver path positionally is the older
# Selenium calling style; newer releases expect a Service object instead.
# Confirm against the installed Selenium version.
driver = webdriver.Chrome(r'C:\Users\Administrator\Desktop\chromedriver.exe')

try:
    # Implicit wait: set before get(); element lookups then poll for up to
    # 10 seconds before failing.
    driver.implicitly_wait(10)

    driver.get('https://www.baidu.com/')

    # Fixed pause after the page load (a plain sleep, not a WebDriverWait).
    time.sleep(5)

    # =============== locator overview ===============
    # find_element  returns the first matching tag
    # find_elements returns every matching tag

    # --- automatic Baidu login: start ---

    # 1. By.LINK_TEXT: locate a link by its visible text.
    login_link = driver.find_element(By.LINK_TEXT, '登录')
    login_link.click()  # open the login dialog
    time.sleep(1)

    # 2. By.ID: locate by the id attribute.
    user_login = driver.find_element(By.ID, 'TANGRAM__PSP_10__footerULoginBtn')
    user_login.click()
    time.sleep(1)

    # 3. By.CLASS_NAME: locate by CSS class.
    user = driver.find_element(By.CLASS_NAME, 'pass-text-input-userName')
    user.send_keys('*******')

    # 4. By.NAME: locate by the name attribute.
    pwd = driver.find_element(By.NAME, 'password')
    pwd.send_keys('*******')

    submit = driver.find_element(By.ID, 'TANGRAM__PSP_10__submit')
    submit.click()

    # 5. By.TAG_NAME: collect every <div> on the page.
    div = driver.find_elements(By.TAG_NAME, 'div')
    print(div)

    time.sleep(20)
finally:
    # Release operating-system resources: quit() ends the session and the
    # driver process, whereas close() would only close the current window.
    driver.quit()

 

转载于:https://www.cnblogs.com/ZHKsuika/p/11119603.html


最新回复(0)