【Python初级爬虫系列--01】python beautifulsoup4 HTML解析器详细用法

it2023-12-05  123

1 import re 2 3 from bs4 import BeautifulSoup, Comment 4 5 html_doc = """<html><head><title>The Dormouse's story</title></head> 6 <body> 7 <p class="title"><b>The Dormouse's story</b></p> 8 9 <p class="story">Once upon a time there were three little sisters; and their names were 10 <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>, 11 <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and 12 <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>; 13 and they lived at the bottom of a well.</p> 14 15 <p class="story">...</p> 16 """ 17 # 或者打开某个html文件 soup = BeautifulSoup(open("index.html")) 18 soup = BeautifulSoup(html_doc, "html.parser") 19 20 tag = soup.p 21 # 获取标签的类型 22 print(type(tag)) # <class 'bs4.element.Tag'> 23 # 获取标签的名字 24 print(tag.name) # p 25 # 获取标签的class属性的值 26 print(tag['class']) # ['title'] 27 # 获取标签的所有属性 28 print(tag.attrs) # {'class': ['title']} 29 30 css_soup = BeautifulSoup('<p class="body strikeout"></p>', "html.parser") 31 # 获取多值属性 32 print(css_soup.p['class']) # ['body', 'strikeout'] 33 # 获取标签内容 34 print(tag.string) # The Dormouse's story 35 # 获取标签内容的类型,字符串用NavigableString来包装 36 print(type(tag.string)) # <class 'bs4.element.NavigableString'> 37 38 # 将标签的内容替换 39 tag.string.replace_with("No longer bold") 40 print(tag) # <p class="title"><b>No longer bold</b></p> 41 42 # BeautifulSoup 对象表示的是一个文档的全部内容 43 print(soup.name) # [document] 44 45 markup = "<b><!--Hey, buddy. Want to buy a used parser?--></b>" 46 soup = BeautifulSoup(markup, "html.parser") 47 comment = soup.b.string 48 # 获取注释的类型Comment,Comment 对象是一个特殊类型的 NavigableString 对象 49 print(type(comment)) # <class 'bs4.element.Comment'> 50 # 以漂亮的格式输出 51 # <b> 52 # <!--Hey, buddy. Want to buy a used parser?--> 53 # </b> 54 print(soup.b.prettify()) 55 56 # 遍历文档树 57 soup = BeautifulSoup(html_doc, "html.parser") 58 # 直接获取head节点 59 print(soup.head) # <head><title>The Dormouse's story</title></head> 60 # 直接获取title节点 61 print(soup.title) # <title>The Dormouse's story</title> 62 # 获取body元素内的第一个b标签 63 print(soup.body.b) # <b>The Dormouse's story</b> 64 # 获取文档内的第一个a标签 65 print(soup.a) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> 66 # 获取文档内的所有a标签 67 # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" 68 # href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" 69 # id="link3">Tillie</a>] 70 print(soup.find_all('a')) 71 # 通过.contents 获取head标签的所有子节点,以列表形式返回 72 print(soup.head.contents) # [<title>The Dormouse's story</title>] 73 # 通过.children获取head的所有子节点 74 # 结果:<title>The Dormouse's story</title> 75 for child in soup.head.children: 76 print(child) 77 # 通过.descendants获取head的所有子孙节点 78 # 结果: 79 # <title>The Dormouse's story</title> 80 # The Dormouse's story 81 for child in soup.head.descendants: 82 print(child) 83 # 获取整个文档的子节点也就是一个html节点 84 print(len(list(soup.children))) # 1 85 # 获取整个文档的所有子孙节点的数量 86 print(len(list(soup.descendants))) # 26 87 # 获取文档内的所有的字符串 88 # 结果: 89 # "The Dormouse's story" 90 # '\n' 91 # '\n' 92 # "The Dormouse's story" 93 # '\n' 94 # 'Once upon a time there were three little sisters; and their names were\n' 95 # 'Elsie' 96 # ',\n' 97 # 'Lacie' 98 # ' and\n' 99 # 'Tillie' 100 # ';\nand they lived at the bottom of a well.' 101 # '\n' 102 # '...' 103 # '\n' 104 for string in soup.strings: 105 print(repr(string)) 106 # 获取文档内的所有字符串,去除多余的空白 107 # 结果: 108 # "The Dormouse's story" 109 # "The Dormouse's story" 110 # 'Once upon a time there were three little sisters; and their names were' 111 # 'Elsie' 112 # ',' 113 # 'Lacie' 114 # 'and' 115 # 'Tillie' 116 # ';\nand they lived at the bottom of a well.' 117 # '...' 118 for string in soup.stripped_strings: 119 print(repr(string)) 120 121 title_tag = soup.title 122 # 通过.parent 查找title节点的父节点 123 print(title_tag.parent) # <head><title>The Dormouse's story</title></head> 124 # 获取所有title_tag的父节点 125 # 结果: 126 # head 127 # html 128 # [document] 129 for parent in title_tag.parents: 130 print(parent.name) 131 132 sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></b></a>", "html.parser") 133 # 通过.next_sibling获取节点的下一个兄弟节点 134 print(sibling_soup.b.next_sibling) # <c>text2</c> 135 print(sibling_soup.c.next_sibling) # None 136 # 通过.previous_sibling获取节点的前一个系统第节点 137 print(sibling_soup.c.previous_sibling) # <b>text1</b> 138 print(sibling_soup.b.previous_sibling) # None 139 # 通过.next_siblings查找a标签的所有兄弟节点 140 # 结果: 141 # ',\n' 142 # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 143 # ' and\n' 144 # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a> 145 # ';\nand they lived at the bottom of a well.' 146 for sibling in soup.a.next_siblings: 147 print(repr(sibling)) 148 # 通过.previous_siblings查找id为link3的所有前置兄弟节点 149 # 结果: 150 # ' and\n' 151 # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 152 # ',\n' 153 # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> 154 # 'Once upon a time there were three little sisters; and their names were\n' 155 for sibling in soup.find(id="link3").previous_siblings: 156 print(repr(sibling)) 157 158 last_a_tag = soup.find("a", id="link3") 159 # 查找最后一个a标签的下一个被解析的对象 160 # 和next_sibling 区别在于是被解析的下一个对象,不是下一个对象 161 print(last_a_tag.next_element) # Tillie 162 # 查找最后一个a标签的上一个被解析的对象 163 print(repr(last_a_tag.previous_element)) # ' and\n' 164 # 查找最后一个a标签之后的所有被解析对象 165 # 结果: 166 # 'Tillie' 167 # ';\nand they lived at the bottom of a well.' 168 # '\n' 169 # <p class="story">...</p> 170 # '...' 171 # '\n' 172 for element in last_a_tag.next_elements: 173 print(repr(element)) 174 175 # 查找文档的所有b标签 176 print(soup.find_all('b')) # [<b>The Dormouse's story</b>] 177 # 查找所有以b开头的标签 178 # 结果: 179 # body 180 # b 181 for tag in soup.find_all(re.compile("^b")): 182 print(tag.name) 183 # 查找所有包含t的标签 184 # 结果: 185 # html 186 # title 187 for tag in soup.find_all(re.compile("t")): 188 print(tag.name) 189 # 传入一个列表查找元素 190 # 结果:[<b>The Dormouse's story</b>, <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 191 # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" 192 # href="http://example.com/tillie" id="link3">Tillie</a>] 193 print(soup.find_all(['a', 'b'])) 194 # 匹配所有元素,但是不会返回字符串节点 195 for tag in soup.find_all(True): 196 print(tag.name) 197 198 199 # 定义过滤方法 200 def has_class_but_no_id(tag): 201 return tag.has_attr('class') and not tag.has_attr('id') 202 203 204 # 通过自定义方法实现过滤 205 # 结果: [<p class="title"><b>The Dormouse's story</b></p>, <p class="story">Once upon a time there were 206 # three little sisters; and their names were <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 207 # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and <a class="sister" 208 # href="http://example.com/tillie" id="link3">Tillie</a>; and they lived at the bottom of a well.</p>, 209 # <p class="story">...</p>] 210 print(soup.find_all(has_class_but_no_id)) 211 # 查找id为link2的元素 212 print(soup.find_all(id='link2')) # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] 213 # 查找href包含elsie的元素 214 print(soup.find_all(href=re.compile("elsie"))) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] 215 # 查找所有包含id属性的元素 216 # 结果: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" 217 # href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" 218 # id="link3">Tillie</a>] 219 print(soup.find_all(id=True)) 220 # 多条件查找元素 221 # 结果: 222 # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] 223 print(soup.find_all(href=re.compile("elsie"), id='link1')) 224 225 data_soup = BeautifulSoup('<div data-foo="value">foo!</div>', 'html.parser') 226 # 对于一些特殊的属性,可以通过attrs的形式查找标签 227 print(data_soup.find_all(attrs={"data-foo": "value"})) # [<div data-foo="value">foo!</div>] 228 # 通过css类名查找元素,因为class是python的关键字,所以用class_代替、 229 # 结果: [<a class="sister" href="http://example.com/elsie" 230 # id="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" 231 # href="http://example.com/tillie" id="link3">Tillie</a>] 232 print(soup.find_all('a', class_='sister')) 233 # class_也可以用正则来过滤 234 print(soup.find_all(class_=re.compile("itl"))) # [<p class="title"><b>The Dormouse's story</b></p>] 235 236 237 def has_six_characters(css_class): 238 return css_class is not None and len(css_class) == 6 239 240 241 # 通过自定义过滤方法过滤元素 242 # 结果:[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" 243 # href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" 244 # id="link3">Tillie</a>] 245 print(soup.find_all(class_=has_six_characters)) 246 # 查找文档中的字符串为Elsie的 247 print(soup.find_all(text="Elsie")) # ['Elsie'] 248 # 正则表达式查找text 249 print(soup.find_all(text=re.compile("Dormouse"))) # ["The Dormouse's story", "The Dormouse's story"] 250 # 通过limit限制返回的结果集数量 251 # 结果: 252 # [<a class="sister" href="http://example.com/elsie" id 253 # ="link1">Elsie</a>, <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] 254 print(soup.find_all("a", limit=2)) 255 # 默认会查找文档的所有子孙节点,如果recursive指定为False则只会查找子节点 256 print(soup.find_all('title', recursive=False)) # [] 257 # 等价于 soup.find_all("a") 258 print(soup("a")) 259 # 等价于 soup.title.find_all(text=True) 260 print(soup.title(text=True)) 261 # find用法与find_all用法基本一致,区别如下: 262 # 1、find返回找到元素的第一个元素,find_all返回所有 263 # 2、如果没有找到元素,find返回None,find_all返回空集合 264 print(soup.find("a")) # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a> 265 266 a_string = soup.find(text='Lacie') 267 # 找到a_string元素的父节点是a的所有元素 268 print(a_string.find_parents("a")) # [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>] 269 # 找到a_string元素的父节点是p的第一个元素 270 # 结果: 271 # <p class="story">Once upon a time there were three little sisters; and their names were 272 # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 273 # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and 274 # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; 275 # and they lived at the bottom of a well.</p> 276 print(a_string.find_parent("p")) 277 # 查找a_string元素的父节点是p,class为title的所有元素 278 print(a_string.find_parents("p", class_="title")) # [] 279 280 first_link = soup.a 281 # 查找第一个a标签的所有是a的兄弟元素 282 # 结果: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" 283 # href="http://example.com/tillie" id="link3">Tillie</a>] 284 print(first_link.find_next_siblings("a")) 285 286 first_story_paragraph = soup.find("p", "story") 287 # 查找first_story_paragraph的下一个标签的p的兄弟标签 288 print(first_story_paragraph.find_next_sibling("p")) # <p class="story">...</p> 289 290 last_link = soup.find("a", id="link3") 291 # 查找last_link的前一个标签是a的所有兄弟标签 292 # 结果: [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 293 # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] 294 print(last_link.find_previous_siblings("a")) 295 # 查找last_link的前一个标签是a的兄弟标签 296 print(last_link.find_previous_sibling("a")) # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> 297 298 first_link = soup.a 299 # 查找first_link之后的所有有字符串的节点 300 # 结果: ['Elsie', ',\n', 'Lacie', ' and\n', 'Tillie', ';\nand they lived at the bottom of a well.', '\n', '...', '\n'] 301 print(first_link.find_all_next(text=True)) 302 # 查找first_link之后的第一个p标签 303 print(first_link.find_next("p")) # <p class="story">...</p> 304 # 查找first_link之前的所有p标签 305 # 结果:[<p class="story">Once upon a time there were three little sisters; and their names were 306 # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, 307 # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a> and 308 # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>; 309 # and they lived at the bottom of a well.</p>, <p class="title"><b>The Dormouse's story</b></p>] 310 print(first_link.find_all_previous("p")) 311 # 查找first_link的前一个title元素 312 print(first_link.find_previous("title")) # <title>The Dormouse's story</title> 313 314 315 # CSS 选择器 316 # 通过css选择器来查找标签为title的元素 317 print(soup.select("title")) # [<title>The Dormouse's story</title>] 318 # 查找是p元素的第三个元素 319 print(soup.select("p:nth-of-type(3)")) # [<p class="story">...</p>] 320 # 逐级查找body下的所有a标签 321 # 结果: [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" 322 # href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" 323 # id="link3">Tillie</a>] 324 print(soup.select("body a")) 325 # 逐级查找html下的head虾的title元素 326 print(soup.select("html head title")) # [<title>The Dormouse's story</title>] 327 # 查找head元素下的直接子title元素 328 print(soup.select("head > title")) # [<title>The Dormouse's story</title>] 329 # 查找p元素下子元素id为link1的元素 330 print(soup.select("p > #link1")) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] 331 # 查找body下的子元素为a的元素,不会逐级查找 332 print(soup.select("body > a")) # [] 333 # 查找id为link1的所有class为sister的兄弟节点 334 # 结果:[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>, 335 # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] 336 print(soup.select("#link1 ~ .sister")) 337 # 通过css类型sister查找元素 338 # 结果:[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" 339 # href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" 340 # id="link3">Tillie</a>] 341 print(soup.select(".sister")) 342 # 通过id来查找元素 343 print(soup.select("#link1")) # [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] 344 # 查找所有a标签包含href属性的 345 # 结果:[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" 346 # href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" 347 # id="link3">Tillie</a>] 348 print(soup.select("a[href]")) 349 # 根据a标签的href属性值查找元素 350 # 结果:[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>] 351 print(soup.select('a[href="http://example.com/elsie"]')) 352 # 根据a标签的href前缀查找元素 353 # 结果:[<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>, <a class="sister" 354 # href="http://example.com/lacie" id="link2">Lacie</a>, <a class="sister" href="http://example.com/tillie" 355 # id="link3">Tillie</a>] 356 print(soup.select('a[href^="http://example.com"]')) 357 # 查找所有a标签的href值是以tillie结尾的 358 # 结果:[<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>] 359 print(soup.select('a[href$="tillie"]')) 360 # 查找所有href的值与表达式相匹配的a标签 361 print(soup.select('a[href*=".com/el"]')) 362 363 364 # 修改文档树 365 soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', "html.parser") 366 tag = soup.b 367 # 修改标签的name 368 tag.name = "blockquote" 369 # 修改标签的class 370 tag['class'] = "verybold" 371 # 新增标签的id属性 372 tag['id'] = 1 373 print(tag) # <blockquote class="verybold" id="1">Extremely bold</blockquote> 374 # 通过.string修改标签的内容 375 tag.string = "New link text." 376 print(tag) # <blockquote class="verybold" id="1">New link text.</blockquote> 377 378 soup = BeautifulSoup("<a>Foo</a>", "html.parser") 379 # 对指定标签增加内容 380 soup.a.append("Bar") 381 print(soup.a) # <a>FooBar</a> 382 # 通过new_string()方法创建一个字符串对象 383 new_string = soup.new_string("New content") 384 soup.a.append(new_string) 385 print(soup.a) # <a>FooBarNew content</a> 386 # 创建一个注释对象 387 new_comment = soup.new_string("I am comment.", Comment) 388 soup.a.append(new_comment) 389 print(soup.a) # <a>FooBarNew content<!--I am comment.--></a> 390 391 soup = BeautifulSoup("<b></b>", "html.parser") 392 original_tag = soup.b 393 # 通过new_tag()方法创建一个新的标签 394 new_tag = soup.new_tag("a", href="http://www.example.com") 395 original_tag.append(new_tag) 396 print(original_tag) # <b><a href="http://www.example.com"></a></b> 397 398 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' 399 soup = BeautifulSoup(markup, "html.parser") 400 tag = soup.a 401 # 通过insert()方法将制定内容插入对应的下标下 402 tag.insert(1, "but did not endorse") 403 print(tag) # <a href="http://example.com/">I linked to but did not endorse<i>example.com</i></a> 404 405 soup = BeautifulSoup("<b>stop</b>", "html.parser") 406 tag = soup.new_tag("i") 407 tag.string = "Don't" 408 # 通过insert_before()方法在当前tag或者文本节点前插入内容 409 soup.b.string.insert_before(tag) 410 print(soup) # <b><i>Don't</i>stop</b> 411 # 通过insert_after() 方法在当前tag或文本节点后插入内容 412 soup.b.i.insert_after(soup.new_string(" no no ")) 413 print(soup) # <b><i>Don't</i> no no stop</b> 414 415 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' 416 soup = BeautifulSoup(markup, 'html.parser') 417 tag = soup.a 418 # 通过clear() 方法移除当前tag的内容 419 tag.clear() 420 print(tag) # <a href="http://example.com/"></a> 421 422 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' 423 soup = BeautifulSoup(markup, 'html.parser') 424 a_tag = soup.a 425 # 通过extract() 方法将当前tag移除文档树,并作为方法结果返回 426 i_tag = soup.i.extract() 427 print(a_tag) # <a href="http://example.com/">I linked to </a> 428 print(i_tag) # <i>example.com</i> 429 430 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' 431 soup = BeautifulSoup(markup, 'html.parser') 432 a_tag = soup.a 433 # 通过decompose() 方法将当前节点移除文档树并完全销毁 434 i_tag = soup.i.decompose() 435 print(a_tag) # <a href="http://example.com/">I linked to </a> 436 print(i_tag) # None 437 438 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' 439 soup = BeautifulSoup(markup, 'html.parser') 440 a_tag = soup.a 441 new_tag = soup.new_tag("b") 442 new_tag.string = "example.net" 443 # 通过replace_with() 方法移除文档树中的某段内容,并用新tag或文本节点替代它 444 a_tag.i.replace_with(new_tag) 445 print(a_tag) # <a href="http://example.com/">I linked to <b>example.net</b></a> 446 447 soup = BeautifulSoup("<p>I wish I was bold.</p>", 'html.parser') 448 # 通过wrap() 方法可以对指定的tag元素进行包装 449 soup.p.string.wrap(soup.new_tag("b")) 450 print(soup) # <p><b>I wish I was bold.</b></p> 451 452 markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>' 453 soup = BeautifulSoup(markup, 'html.parser') 454 a_tag = soup.a 455 # unwrap() 方法与 wrap() 方法相反.将移除tag内的所有tag标签,该方法常被用来进行标记的解包 456 a_tag.i.unwrap() 457 print(a_tag) # <a href="http://example.com/">I linked to example.com</a> 458 459 markup = '<a href="http://example.com/">\nI linked to <i>example.com</i>\n</a>' 460 soup = BeautifulSoup(markup, 'html.parser') 461 # 如果只想得到tag中包含的文本内容,那么可以嗲用 get_text() 方法,这个方法获取到tag中包含的所有文版内容包括子孙tag中的内容,并将结果作为Unicode字符串返回: 462 print(repr(soup.get_text())) # '\nI linked to example.com\n' 463 # 可以通过参数指定tag的文本内容的分隔符 464 print(repr(soup.get_text("|"))) # '\nI linked to |example.com|\n' 465 # 还可以去除获得文本内容的前后空白 466 print(repr(soup.get_text("|", strip=True))) # 'I linked to|example.com'

 

转载于:https://www.cnblogs.com/forfreewill/articles/9989317.html

相关资源:各显卡算力对照表!
最新回复(0)