6.18-python学习
一.今日内容:
1.关于selenium选择器xpath的使用:
# Demo: locating elements with XPath selectors through Selenium.
from selenium import webdriver

driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    # Implicit wait: configured before the get() request.
    driver.implicitly_wait(5)

    driver.get('https://doc.scrapy.org/en/latest/_static/selectors-sample1.html')

    # Explicit wait: used after the get() request.
    # wait.until(...)

    # Markup of the sample page, for reference:
    #
    # <html>
    #  <head>
    #   <base href='http://example.com/' />
    #   <title>Example website</title>
    #  </head>
    #  <body>
    #   <div id='images'>
    #    <a href='image1.html'>Name: My image 1 <br /><img src='image1_thumb.jpg' /></a>
    #    <a href='image2.html'>Name: My image 2 <br /><img src='image2_thumb.jpg' /></a>
    #    <a href='image3.html'>Name: My image 3 <br /><img src='image3_thumb.jpg' /></a>
    #    <a href='image4.html'>Name: My image 4 <br /><img src='image4_thumb.jpg' /></a>
    #    <a href='image5.html'>Name: My image 5 <br /><img src='image5_thumb.jpg' /></a>
    #   </div>
    #  </body>
    # </html>

    # Look elements up with XPath syntax.

    # "/" starts matching at the root node.
    html = driver.find_element_by_xpath('/html')
    # html = driver.find_element_by_xpath('/head')  # raises an error
    print(html.tag_name)

    # "//" matches a node anywhere below the root.
    div = driver.find_element_by_xpath('//div')
    print(div.tag_name)

    # "@" filters on an attribute: find the div whose id is "images".
    div = driver.find_element_by_xpath('//div[@id="images"]')
    print(div.tag_name)
    print(div.text)

    # First <a> node.
    a = driver.find_element_by_xpath('//a')
    print(a.tag_name)

    # All <a> nodes.
    a_s = driver.find_elements_by_xpath('//a')
    print(a_s)

    # href attribute of the first <a> node.
    # get_attribute() returns the value of one attribute of the node.
    a = driver.find_element_by_xpath('//a').get_attribute('href')
    print(a)

finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window and leave the chromedriver process running.
    driver.quit()

# 2.selenium剩余更多操作:
# ---------------------------------------------------------------------------
# Clicking and clearing
# ---------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    driver.implicitly_wait(10)

    # 1. Send a request to jd.com.
    driver.get('https://www.jd.com/')

    # Find the search box and type the first query.
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('围城')

    # Press Enter.
    input_tag.send_keys(Keys.ENTER)

    time.sleep(2)

    # Find the search box again, clear it and type the second query.
    input_tag = driver.find_element_by_id('key')
    input_tag.clear()
    input_tag.send_keys('墨菲定律')

    # Find the search button and click it.
    button = driver.find_element_by_class_name('button')
    button.click()

    time.sleep(10)

finally:
    # quit() ends the session; close() would leave chromedriver running.
    driver.quit()


# ---------------------------------------------------------------------------
# Reading cookies (for reference)
# ---------------------------------------------------------------------------
from selenium import webdriver
import time

driver = webdriver.Chrome(r'D:\BaiduNetdiskDownload\chromedriver_win32\chromedriver.exe')

try:
    driver.implicitly_wait(10)
    driver.get('https://www.zhihu.com/explore')
    print(driver.get_cookies())

    time.sleep(10)

finally:
    driver.quit()


# ---------------------------------------------------------------------------
# Tabs
# ---------------------------------------------------------------------------
# Tab management: a new tab can be opened with JavaScript (window.open) or
# with keyboard shortcuts such as Ctrl+T; the JavaScript way is the most
# general one.
import time
from selenium import webdriver

browser = webdriver.Chrome()

try:
    browser.get('https://www.baidu.com')

    # execute_script() runs JavaScript code in the page.
    # Pop-up example:
    # browser.execute_script('alert("tank")')

    # Open a new browser window.
    browser.execute_script(
        '''
        window.open();
        '''
    )
    time.sleep(1)

    print(browser.window_handles)  # handles of all open tabs

    # Switch to the second window.
    # New API:
    browser.switch_to.window(browser.window_handles[1])
    # Old (deprecated) API:
    # browser.switch_to_window(browser.window_handles[1])

    # Load taobao in the second window.
    browser.get('https://www.taobao.com')
    time.sleep(5)

    # Switch back to the first window, using the same non-deprecated API.
    browser.switch_to.window(browser.window_handles[0])
    browser.get('https://www.sina.com.cn')

    time.sleep(10)

finally:
    # Two tabs are open here: close() would only close the current one,
    # quit() closes every window and ends the session.
    browser.quit()


# ---------------------------------------------------------------------------
# ActionChains
# ---------------------------------------------------------------------------
from selenium import webdriver
from selenium.webdriver import ActionChains
import time

driver = webdriver.Chrome()

try:
    # Kept inside the try block so a failed page load still releases the browser.
    driver.implicitly_wait(10)
    driver.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')

    # Switch into the frame whose id is "iframeResult".
    # Old API: driver.switch_to_frame('iframeResult')
    driver.switch_to.frame('iframeResult')

    # Source element.
    draggable = driver.find_element_by_id('draggable')

    # Target element.
    droppable = driver.find_element_by_id('droppable')

    # ActionChains must be given the driver object.

    # Approach 1: robot-like, jumps from source to target in one step.
    # ActionChains(driver).drag_and_drop(draggable, droppable).perform()

    # Approach 2: imitate a human by moving in small steps.
    source = draggable.location['x']
    target = droppable.location['x']
    print(source, target)

    distance = target - source
    print(distance)

    # Every action has to be executed with perform().
    # Click and hold the source element.
    ActionChains(driver).click_and_hold(draggable).perform()

    s = 0
    while s < distance:
        # Move 2 pixels to the right per step.
        ActionChains(driver).move_by_offset(xoffset=2, yoffset=0).perform()
        s += 2

    # Release the mouse button.
    ActionChains(driver).release().perform()

    time.sleep(10)

finally:
    driver.quit()


# ---------------------------------------------------------------------------
# Back and forward
# ---------------------------------------------------------------------------
from selenium import webdriver
import time

driver = webdriver.Chrome()

try:
    driver.implicitly_wait(10)
    driver.get('https://www.jd.com/')
    driver.get('https://www.baidu.com/')
    driver.get('https://www.cnblogs.com/')

    time.sleep(2)

    # Go back.
    driver.back()
    time.sleep(1)

    # Go forward.
    driver.forward()
    time.sleep(1)

    driver.back()
    time.sleep(10)

finally:
    driver.quit()

# 3.破解登录方法:
# Demo: reuse an existing Chrome profile and add a session cookie to log in.
from selenium import webdriver
from selenium.webdriver import ChromeOptions
import time

# Steps:
#   1. Enable "show hidden files" in the file explorer.
#   2. Go to C:\Users\administortra\AppData\Local\Google\Chrome\User Data
#      and delete the Default folder.
#   3. Reopen the browser and log in to the Baidu account
#      - this creates a new Default cache folder.
#   4. Add the cookies.
#   5. Close Chrome, then run this program.

# Options object that carries the start-up arguments.
options = ChromeOptions()

# Directory where the cookies are stored:
# 'C:\Users\administortra\AppData\Local\Google\Chrome\User Data'
profile_directory = r'--user-data-dir=C:\Users\administortra\AppData\Local\Google\Chrome\User Data'

# Add the user data directory argument.
options.add_argument(profile_directory)

# Hand the options to the driver; the chrome_options keyword receives the
# options object.
driver = webdriver.Chrome(chrome_options=options)

try:
    driver.implicitly_wait(10)
    driver.get('https://www.baidu.com/')

    # Cookie of interest: BDUSS:*****

    # Add the user's cookie; the keys "name" and "value" must be lowercase.
    driver.add_cookie({"name": "BDUSS", "value": "用户session字符串"})

    # Refresh the page.
    driver.refresh()

    time.sleep(10)

finally:
    # quit() ends the whole session; close() would only close the current
    # window and leave the chromedriver process running.
    driver.quit()

# 4.爬取京东商品信息:
# Scrape product information from JD.
#
# Request URL:
#     https://www.jd.com/
# Extracted fields:
#     1. product detail page link
#     2. product name
#     3. product price
#     4. number of reviews
#     5. seller
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()

try:
    driver.implicitly_wait(10)

    # 1. Send a request to the JD home page.
    driver.get('https://www.jd.com/')

    # 2. Type the product name and press Enter to search.
    input_tag = driver.find_element_by_id('key')
    input_tag.send_keys('macbook')
    input_tag.send_keys(Keys.ENTER)
    time.sleep(2)

    # Scroll down with JavaScript so that all products get loaded.
    js_code = '''
        window.scrollTo(0,5000);
    '''
    driver.execute_script(js_code)

    # Wait for the data to load.
    time.sleep(2)

    # 3. Find all product items.
    good_list = driver.find_elements_by_class_name('gl-item')

    # Open the output file once instead of once per product.
    with open('jd.txt', 'a', encoding='utf-8') as f:
        for good in good_list:
            # Product link (CSS selector lookup).
            good_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')

            # Product name.
            good_name = good.find_element_by_css_selector('.p-name em').text.replace("\n", "--")

            # Product price.
            good_price = good.find_element_by_class_name('p-price').text.replace("\n", ":")

            # Number of reviews.
            good_commit = good.find_element_by_class_name('p-commit').text.replace("\n", " ")

            # Seller.
            good_from = good.find_element_by_class_name('J_im_icon').text.replace("\n", " ")

            good_content = f'''
            商品链接: {good_url}
            商品名称: {good_name}
            商品价格: {good_price}
            评价人数: {good_commit}
            商品商家: {good_from}
            \n
            '''
            print(good_content)
            f.write(good_content)

    # Go to the next result page.
    next_tag = driver.find_element_by_link_text('下一页')
    next_tag.click()

    time.sleep(10)

finally:
    # quit() ends the whole session; close() would only close the current
    # window and leave the chromedriver process running.
    driver.quit()

# 二.作业:
1.爬取京东商品信息:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time


def _print_goods_on_page(driver):
    """Scroll the current result page and print every product found on it."""
    js_code = '''
        window.scrollTo(0,5000);
    '''
    driver.execute_script(js_code)
    # Give the page time to load after scrolling.
    time.sleep(2)

    for good in driver.find_elements_by_class_name('gl-item'):
        good_url = good.find_element_by_css_selector('.p-img a').get_attribute('href')
        good_name = good.find_element_by_css_selector('.p-name em').text.replace("\n", "--")
        good_price = good.find_element_by_class_name('p-price').text.replace("\n", ":")
        good_commit = good.find_element_by_class_name('p-commit').text.replace("\n", " ")
        good_from = good.find_element_by_class_name('J_im_icon').text.replace("\n", " ")

        good_content = f'''
        商品链接:{good_url}
        商品名称:{good_name}
        商品价格:{good_price}
        评价人数:{good_commit}
        商品商家:{good_from}
        \n
        '''
        print(good_content)


def get_good(driver, max_pages=None):
    """Print the products of successive JD search-result pages.

    Pages are walked in a loop rather than by recursion, so the call stack
    does not grow with the number of pages and the driver is shut down
    exactly once.

    Args:
        driver: WebDriver already showing a search-result page.
        max_pages: Maximum number of pages to process; ``None`` (default)
            keeps going until no usable next-page button is found.

    The driver is always shut down before returning, including on error.
    """
    # Local import: keeps this function self-contained.
    from selenium.common.exceptions import NoSuchElementException

    try:
        page = 0
        while max_pages is None or page < max_pages:
            _print_goods_on_page(driver)
            page += 1
            if page == max_pages:
                break

            time.sleep(10)
            try:
                next_tag = driver.find_element_by_class_name('pn-next')
            except NoSuchElementException:
                # No next-page button: this was the last page.
                break

            # NOTE(review): assumes the last page marks the button with a
            # "disabled" class - confirm against the live page.
            if 'disabled' in (next_tag.get_attribute('class') or '').split():
                break

            next_tag.click()
            time.sleep(2)
    finally:
        # quit() ends the whole session; close() would only close the
        # current window and leave the chromedriver process running.
        driver.quit()


if __name__ == '__main__':
    good_name = input('请输入商品名:').strip()

    driver = webdriver.Chrome()
    try:
        driver.implicitly_wait(10)
        driver.get("https://www.jd.com/")

        input_tag = driver.find_element_by_id('key')
        input_tag.send_keys(good_name)
        input_tag.send_keys(Keys.ENTER)
        time.sleep(2)
    except Exception:
        # get_good() has not taken over cleanup yet, so release the browser here.
        driver.quit()
        raise

    get_good(driver)

# 转载于:https://www.cnblogs.com/yang-haha/p/11065183.html