1. The bs4 parsing library:
'''
pip3 install beautifulsoup4  # install bs4
pip3 install lxml            # install the lxml parser
'''
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="sister"><b>$37</b></p>
<p class="story" id="p">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" >Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""

# import BeautifulSoup from bs4
from bs4 import BeautifulSoup

# instantiate BeautifulSoup to get a soup object
# arg 1: the text to parse
# arg 2: the parser (html.parser, lxml, ...)
soup = BeautifulSoup(html_doc, 'lxml')
print(soup)
print('*' * 100)
print(type(soup))
print('*' * 100)

# pretty-print the document
html = soup.prettify()
print(html)
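One hedged note: if lxml is not installed, the constructor raises bs4.FeatureNotFound. A small fallback sketch (make_soup is a helper name invented here, not from the lesson):

from bs4 import BeautifulSoup, FeatureNotFound

def make_soup(markup):
    # prefer the faster lxml parser; fall back to the stdlib parser if lxml is missing
    try:
        return BeautifulSoup(markup, 'lxml')
    except FeatureNotFound:
        return BeautifulSoup(markup, 'html.parser')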
2. bs4: traversing the document tree:

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>tank</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""

from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')

'''
Traversing the document tree:
    1. direct attribute access
    2. get a tag's name
    3. get a tag's attributes
    4. get a tag's text
    5. nested selection
    6. children and descendants
    7. parent and ancestors
    8. siblings
'''

# 1. direct attribute access
print(soup.p)  # find the first p tag
print(soup.a)  # find the first a tag

# 2. get a tag's name
print(soup.head.name)  # the name of the head tag

# 3. get a tag's attributes
print(soup.a.attrs)          # all attributes of the first a tag
print(soup.a.attrs['href'])  # the href attribute of the first a tag

# 4. get a tag's text
print(soup.p.text)  # $37

# 5. nested selection
print(soup.html.head)

# 6. children and descendants
print(soup.body.children)        # all direct children of body, returned as an iterator
print(list(soup.body.children))  # cast to a list
print(soup.body.descendants)        # all descendants
print(list(soup.body.descendants))

# 7. parent and ancestors
print(soup.p.parent)   # the parent node of the first p tag
print(soup.p.parents)  # all ancestors of the p tag; returns a generator
print(list(soup.p.parents))

# 8. siblings
# the next sibling
print(soup.p.next_sibling)
# all following siblings; returns a generator
print(soup.p.next_siblings)
print(list(soup.p.next_siblings))
# the previous sibling of the first a tag
print(soup.a.previous_sibling)
# all preceding siblings of the first a tag; returns a generator
print(soup.a.previous_siblings)
print(list(soup.a.previous_siblings))
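Worth making explicit: .children yields only direct children, while .descendants walks the entire subtree, text nodes included. A quick sketch reusing the soup object above:

direct = list(soup.body.children)
all_nodes = list(soup.body.descendants)
print(len(direct), len(all_nodes))  # descendants is always >= children

# tag names only, skipping bare text nodes
from bs4 import Tag
print([node.name for node in soup.body.descendants if isinstance(node, Tag)])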
3. bs4: searching the document tree:

html_doc = """<html><head><title>The Dormouse's story</title></head><body><p class="sister"><b>$37</b></p><p class="story" id="p">Once upon a time there were three little sisters; and their names were<b>tank</b><a href="http://example.com/elsie" class="sister" >Elsie</a>,<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;and they lived at the bottom of a well.<hr></hr></p><p class="story">...</p>"""

'''
Searching the document tree:
    find()      find one
    find_all()  find many

Searching by tag and by attribute:
    tag:
        name   match by tag name
        attrs  match by attributes
        text   match by text
        - string filter    exact string match
        - regex filter     match with the re module
        - list filter      match any item in a list
        - bool filter      True matches any value
        - function filter  for searches on attributes a tag must or must not have
    attributes:
        - class_
        - id
'''

from bs4 import BeautifulSoup
import re

soup = BeautifulSoup(html_doc, 'lxml')

# string filters
# name
p_tag = soup.find(name='p')
print(p_tag)  # find the first tag named p
# find all tags named p
tag_s1 = soup.find_all(name='p')
print(tag_s1)

# attrs
# find the first tag whose class is sister
p = soup.find(attrs={"class": "sister"})
print(p)
# find all tags whose class is sister
tag_s2 = soup.find_all(attrs={"class": "sister"})
print(tag_s2)

# text
text = soup.find(text="$37")
print(text)

# combined:
# find an a tag whose id is link2 and whose text is Lacie
a_tag = soup.find(name="a", attrs={"id": "link2"}, text="Lacie")
print(a_tag)

# regex filter
# name
p_tag = soup.find(name=re.compile('p'))
print(p_tag)

# list filter
# name
tags = soup.find_all(name=['p', 'a', re.compile('html')])
print(tags)

# bool filter
# True matches any value
# find a p tag that has an id attribute
p = soup.find(name='p', attrs={"id": True})
print(p)

# function filter
# match a tags that have both an id and a class attribute
def have_id_class(tag):
    if tag.name == 'a' and tag.has_attr('id') and tag.has_attr('class'):
        return tag

tag = soup.find(name=have_id_class)
print(tag)
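Two find_all keyword arguments not shown above are limit (caps the number of results) and recursive=False (searches direct children only); bs4 also supports CSS selectors via select/select_one. A short sketch against the same soup:

# at most two a tags
print(soup.find_all(name='a', limit=2))

# only direct children of body are considered, not the whole subtree
print(soup.body.find_all(name='p', recursive=False))

# CSS selectors as an alternative to find/find_all
print(soup.select('a.sister'))    # all a tags with class sister
print(soup.select_one('#link2'))  # the tag whose id is link2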
4. Scraping Wandoujia:

'''
Index page: icon URL, download count, size, detail-page URL
Detail page: game name, icon name, rating, comment count, editor's review, description, user comments, 1-5 screenshot URLs, download URL

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup


# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response


# 2. Parse
# parse the index page
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')
    # get the li tag of every app
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # icon URL
        img = app.find(name='img').attrs['data-original']
        print(img)
        # download count
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)
        # size; search from app, not soup, or every iteration returns the first match on the page
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        print(size)
        # detail-page URL
        detail_url = app.find(name='a', attrs={"class": "detail-check-btn"}).attrs['href']
        print(detail_url)


def main():
    for line in range(1, 33):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
        # 1. request the app API
        response = get_page(url)
        print('*' * 1000)
        # deserialize the response to a dict
        data = response.json()
        # pull the app HTML out of the API response
        app_li = data['data']['content']
        # 2. parse the app HTML
        parse_index(app_li)


if __name__ == '__main__':
    main()

The second version also requests and parses each detail page:

'''
Index page: icon URL, download count, size, detail-page URL
Detail page: game name, rating, comment count, editor's review, download URL, description, user comments, 1-5 screenshot URLs

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup


# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response


# 2. Parse
# parse the detail page
def parse_detail(text):
    soup = BeautifulSoup(text, 'lxml')
    # app name
    name = soup.find(name="span", attrs={"class": "title"}).text
    # rating
    love = soup.find(name='span', attrs={"class": "love"}).text
    # comment count
    commit_num = soup.find(name='a', attrs={"class": "comment-open"}).text
    # editor's review
    commit_content = soup.find(name='div', attrs={"class": "con"}).text
    # app download URL
    download_url = soup.find(name='a', attrs={"class": "normal-dl-btn"}).attrs['href']

    print(
        f'''
        ============= tank ==============
        app name: {name}
        rating: {love}
        comment count: {commit_num}
        editor's review: {commit_content}
        app download URL: {download_url}
        ============= end ==============
        '''
    )
# parse the index page
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')
    # get the li tag of every app
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # icon URL: the data-original attribute of the first img tag
        img = app.find(name='img').attrs['data-original']
        print(img)

        # download count: the text of the span tag whose class is install-count
        down_num = app.find(name='span', attrs={"class": "install-count"}).text
        print(down_num)

        # size: the span whose text matches digits + MB (\d+ matches the digits);
        # search from app, not soup, or every iteration returns the first match on the page
        size = app.find(name='span', text=re.compile(r"\d+MB")).text
        print(size)

        # detail-page URL: the href attribute of the first a tag
        detail_url = app.find(name='a').attrs['href']
        print(detail_url)

        # 3. request the detail page
        response = get_page(detail_url)
        # 4. parse the detail page
        parse_detail(response.text)


def main():
    for line in range(1, 33):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
        # 1. request the app API
        response = get_page(url)
        print('*' * 1000)
        # deserialize the response to a dict
        data = response.json()
        # pull the app HTML out of the API response
        app_li = data['data']['content']
        # 2. parse the app HTML
        parse_index(app_li)


if __name__ == '__main__':
    main()
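One practical note: get_page above sends a bare request with requests' default headers, which some sites reject. A hedged hardening sketch; the User-Agent value is a stand-in, not taken from the original lesson:

import requests

HEADERS = {'User-Agent': 'Mozilla/5.0'}  # stand-in value; any desktop browser UA string works

def get_page(url):
    # send a browser-like header, bound the wait, and fail fast on HTTP errors
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()  # raise on 4xx/5xx instead of parsing an error page
    return response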
5. Basic MongoDB usage:

MongoDB is a non-relational (NoSQL) database.

I. Installation and usage
1. Download and install: https://www.mongodb.com/download-center/community
2. Create a data/db folder on the C drive; this is where the data is stored.
3. Start the server with mongod: open a terminal and run mongod to start the MongoDB service.
4. Enter the MongoDB client with mongo: open a new terminal and run mongo.
II. Database operations
Switch database:
    SQL:     use admin;  (switches if the database exists, errors otherwise)
    MongoDB: use tank;   (switches if it exists, otherwise creates the tank database and switches into it)
Show databases:
    SQL:     show databases;
    MongoDB: show dbs;   (a database that contains no data is not listed)
Drop a database:
    SQL:     drop database
    MongoDB: db.dropDatabase()
Collection operations (a collection is what MySQL calls a table):
Create a collection:
    SQL:     create table (f1, f2...)
    MongoDB: collections are created via dot access on the current database, e.g. db.student
Insert data:
    # insert multiple documents
    db.student.insert([{"name1": "tank1"}, {"name2": "tank2"}])
    # insert one document
    db.student.insert({"name": "tank"})
Query data:
    # find all documents in the student collection
    db.student.find({})
    # find one: find the records whose name is tank
    db.student.find({"name": "tank"})
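For reference, rough pymongo equivalents of the shell commands above; a sketch that assumes a local server on the default port and uses only documented pymongo calls:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
db = client['tank']                              # use tank
print(client.list_database_names())              # show dbs
db['student'].insert_one({"name": "tank"})       # db.student.insert({"name": "tank"})
print(db['student'].find_one({"name": "tank"}))  # db.student.find({"name": "tank"})
client.drop_database('tank')                     # db.dropDatabase()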
III. Connecting to MongoDB from Python
1. Install the third-party module pymongo: pip3 install pymongo
2. Connect to the MongoDB server: client = MongoClient('localhost', 27017)
6. Basic pymongo usage:
from pymongo import MongoClient

# 1. Connect to the MongoDB server
# arg 1: the MongoDB host/IP
# arg 2: the MongoDB port (default 27017)
client = MongoClient('localhost', 27017)
# print(client)

# 2. Enter the tank_db database (created on first use)
# print(client['tank_db'])

# 3. Create a collection
# print(client['tank_db']['people'])

# 4. Insert data into tank_db
# 1) insert one document
data1 = {
    'name': 'tank',
    'age': 18,
    'sex': 'male'
}
client['tank_db']['people'].insert(data1)

# 2) insert multiple documents
data1 = {
    'name': 'tank',
    'age': 18,
    'sex': 'male'
}
data2 = {
    'name': '戚志云',
    'age': 84,
    'sex': 'female'
}
data3 = {
    'name': '沈金金',
    'age': 73,
    'sex': 'male'
}
client['tank_db']['people'].insert([data1, data2, data3])

# 5. Query data
# all documents
data_s = client['tank_db']['people'].find()
print(data_s)  # <pymongo.cursor.Cursor object at 0x000002EEA6720128>
# the cursor has to be iterated to print every document
for data in data_s:
    print(data)

# one document
data = client['tank_db']['people'].find_one()
print(data)

# Officially recommended API:
# insert one document: insert_one
client['tank_db']['people'].insert_one(data1)
# insert multiple documents: insert_many
client['tank_db']['people'].insert_many([data2, data3])
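The notes above only cover insert and find; for completeness, a short sketch of update and delete using pymongo's documented update_one/delete_one/delete_many (not part of the original lesson):

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
people = client['tank_db']['people']

# update the first document that matches the filter; $set only touches the listed fields
people.update_one({'name': 'tank'}, {'$set': {'age': 19}})

# delete the first matching document
people.delete_one({'name': 'tank'})

# delete every matching document
people.delete_many({'sex': 'male'})

client.close()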
II. Homework:
1. Organize today's class notes and write a blog post.
2. Scrape the remaining Wandoujia fields: the description, the screenshot URLs and the user comments (a starting sketch follows this list).
3. Insert the scraped Wandoujia data into MongoDB:
   - create a wandoujia database
   - store the index-page data in a collection named index
   - store the detail-page data in a collection named detail
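For homework item 2, a rough starting point: the class names desc-info, screenshot-box and comment-list below are hypothetical placeholders for illustration, since the real Wandoujia selectors are not given in these notes. The item-3 solution follows after it.

from bs4 import BeautifulSoup

def parse_detail_extra(text):
    soup = BeautifulSoup(text, 'lxml')

    # description; 'desc-info' is a hypothetical class name
    desc_tag = soup.find(name='div', attrs={"class": "desc-info"})
    desc = desc_tag.text.strip() if desc_tag else None

    # screenshot URLs; 'screenshot-box' is a hypothetical class name
    shot_box = soup.find(name='div', attrs={"class": "screenshot-box"})
    screenshots = [img.attrs.get('src') for img in shot_box.find_all(name='img')] if shot_box else []

    # user comments; 'comment-list' is a hypothetical class name
    comment_list = soup.find(name='div', attrs={"class": "comment-list"})
    comments = [p.text.strip() for p in comment_list.find_all(name='p')] if comment_list else []

    return {'desc': desc, 'screenshots': screenshots[:5], 'comments': comments}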
'''
Index page: icon URL, download count, size, detail-page URL
Detail page: game name, rating, comment count, editor's review, download URL, description, user comments, 1-5 screenshot URLs

https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=1&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=2&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page=3&ctoken=FRsWKgWBqMBZLdxLaK4iem9B
32 pages in total
'''
import re

import requests
from bs4 import BeautifulSoup
from pymongo import MongoClient

'''
3. Insert the scraped Wandoujia data into MongoDB:
   - create a wandoujia database
   - store the index-page data in a collection named index
   - store the detail-page data in a collection named detail
'''
# connect to the MongoDB server
client = MongoClient('localhost', 27017)
# create/select the index collection of the wandoujia database
index_col = client['wandoujia']['index']
# create/select the detail collection of the wandoujia database
detail_col = client['wandoujia']['detail']


# 1. Send the request
def get_page(url):
    response = requests.get(url)
    return response


# 2. Parse
# parse the detail page
def parse_detail(text):
    soup = BeautifulSoup(text, 'lxml')

    # app name
    try:
        name = soup.find(name="span", attrs={"class": "title"}).text
    except Exception:
        # on any error, fall back to None
        name = None

    # rating
    try:
        love = soup.find(name='span', attrs={"class": "love"}).text
    except Exception:
        love = None

    # comment count
    try:
        commit_num = soup.find(name='a', attrs={"class": "comment-open"}).text
    except Exception:
        commit_num = None

    # editor's review
    try:
        commit_content = soup.find(name='div', attrs={"class": "con"}).text
    except Exception:
        commit_content = None

    # app download URL
    try:
        download_url = soup.find(name='a', attrs={"class": "normal-dl-btn"}).attrs['href']
    except Exception:
        download_url = None

    # assemble the record once, substituting placeholders for the two fields that
    # may legitimately be absent (the original three-branch version could leave
    # detail_data undefined when name was missing)
    detail_data = {
        'name': name,
        'love': love if love else 'no likes yet, sad',
        'commit_num': commit_num,
        'commit_content': commit_content,
        'download_url': download_url if download_url else 'no installer available'
    }

    # insert the detail-page record (insert_one is the recommended API)
    detail_col.insert_one(detail_data)
    print(f'{name} detail data inserted!')
# parse the index page
def parse_index(data):
    soup = BeautifulSoup(data, 'lxml')
    # get the li tag of every app
    app_list = soup.find_all(name='li', attrs={"class": "card"})
    for app in app_list:
        # icon URL: the data-original attribute of the first img tag
        img = app.find(name='img').attrs['data-original']

        # download count: the text of the span tag whose class is install-count
        down_num = app.find(name='span', attrs={"class": "install-count"}).text

        # size: the span whose text matches digits + MB (\d+ matches the digits);
        # search from app, not soup, or every iteration returns the first match on the page
        size = app.find(name='span', text=re.compile(r"\d+MB")).text

        # detail-page URL: the href attribute of the first a tag
        detail_url = app.find(name='a').attrs['href']

        # assemble the index-page record
        index_data = {
            'img': img,
            'down_num': down_num,
            'size': size,
            'detail_url': detail_url
        }
        # insert it (insert_one is the recommended API)
        index_col.insert_one(index_data)
        print('index data inserted!')

        # 3. request the detail page
        response = get_page(detail_url)
        # 4. parse the detail page
        parse_detail(response.text)


def main():
    for line in range(1, 33):
        url = f"https://www.wandoujia.com/wdjweb/api/category/more?catId=6001&subCatId=0&page={line}&ctoken=FRsWKgWBqMBZLdxLaK4iem9B"
        # 1. request the app API
        response = get_page(url)
        print('*' * 1000)
        # deserialize the response to a dict
        data = response.json()
        # pull the app HTML out of the API response
        app_li = data['data']['content']
        # 2. parse the app HTML
        parse_index(app_li)

    # close the MongoDB client once everything has run
    client.close()


if __name__ == '__main__':
    main()

Reposted from: https://www.cnblogs.com/yang-haha/p/11065206.html