参考:https://cuiqingcai.com/5545.html
XPath
XPath 即 XML Path Language,用于在XML文檔中查找信息,同樣適用于HTML文檔;它使用路徑選擇表達式的方式查找信息。
XPath常用规则
nodename: 选取此节点的所有子节点
/: 从当前节点选取直接子节点
//: 从当前节点选取子孙节点
.: 选取当前节点
..: 选取当前节点的父节点
@: 选取属性
# Sample HTML fragment used by the examples below.
# Note: the last <li> has no closing </li> tag; the examples rely on the
# HTML parser tolerating this.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
选取所有节点
from lxml import etree

# Parse the sample markup held in `text` into an element tree.
selector = etree.HTML(text)
# '//*' matches every element node in the document.
result = selector.xpath('//*')
print(result)
输出
[<Element html at 0x1761bfd5508>, <Element body at 0x1761bfd5a88>, <Element div at 0x1761bfd5ac8>, <Element ul at 0x1761bfd5b08>, <Element li at 0x1761bfd5e88>, <Element a at 0x1761bfd5f08>, <Element li at 0x1761bfd5f48>, <Element a at 0x1761bfd5f88>, <Element li at 0x1761bfd5fc8>, <Element a at 0x1761bfd5ec8>, <Element li at 0x1761bfdb048>, <Element a at 0x1761bfdb088>, <Element li at 0x1761bfdb0c8>, <Element a at 0x1761bfdb108>]
子节点
from lxml import etree

selector = etree.HTML(text)
# '//li/a' matches <a> elements that are direct children of any <li>.
result = selector.xpath('//li/a')
print(result)
输出
[<Element a at 0x1761c02dec8>, <Element a at 0x1761c02de88>, <Element a at 0x1761c02df08>, <Element a at 0x1761c02df48>, <Element a at 0x1761c02df88>]
父节点
from lxml import etree

selector = etree.HTML(text)
# '..' steps from each <li> up to its parent; all <li> share one <ul>,
# so a single element is returned.
result = selector.xpath('//li/..')
print(result)
输出
[<Element ul at 0x1761ae7c288>]
属性匹配
from lxml import etree

selector = etree.HTML(text)
# The predicate [@class="item-0"] keeps only <li> elements whose class
# attribute equals "item-0".
result = selector.xpath('//li[@class="item-0"]')
print(result)
输出
[<Element li at 0x1761afe2dc8>, <Element li at 0x1761c067748>]
注:XPath 表达式在 Python 中用单引号包裹时,[@class="item-0"] 里的属性值要使用双引号(内外引号不能相同,否则字符串会被提前截断)
文本获取
from lxml import etree

selector = etree.HTML(text)
# '/' only reaches direct children, so this returns the text nodes that sit
# directly under each matching <li>, not the link text inside <a>.
result1 = selector.xpath('//li[@class="item-0"]/text()')
# Step into <a> first to reach the link text itself.
result2 = selector.xpath('//li[@class="item-0"]/a/text()')
print(result1)
print(result2)
输出
['\n ']
['first item', 'fifth item']
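注://li[@class="item-0"]/text() 得到 ['\n '],因为 "/" 只获取直接子节点:li 的直接子节点是 a 节点,文本位于 a 节点内部,li 自身的直接文本只有换行等空白字符;要获取文本需先进入 a 节点,即 //li[@class="item-0"]/a/text()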
属性获取
from lxml import etree

selector = etree.HTML(text)
# '@href' returns the attribute values (strings) rather than elements.
result = selector.xpath('//li[@class="item-0"]/a/@href')
print(result)
输出
['link1.html', 'link5.html']
属性多值匹配
from lxml import etree

# The class attribute carries two space-separated values.
text1 = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
selector = etree.HTML(text1)
# An equality test compares against the whole attribute value
# ("li li-first"), so @class="li" matches nothing.
result1 = selector.xpath('//li[@class="li"]/a/text()')
# contains() performs a substring test and therefore matches.
result2 = selector.xpath('//li[contains(@class,"li")]/a/text()')
print(result1)
print(result2)
输出
[]
['first item']
多属性匹配
from lxml import etree

text2 = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
selector = etree.HTML(text2)
# Combine two attribute conditions with the XPath 'and' operator.
result = selector.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
print(result)
输出
['first item']
按序选择
from lxml import etree

text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
selector = etree.HTML(text)
# XPath positions are 1-based: [1] is the first <li>.
result1 = selector.xpath('//li[1]/a/text()')
print(result1)
# last() is the position of the final <li>.
result2 = selector.xpath('//li[last()]/a/text()')
print(result2)
# position()<3 keeps the first two <li> elements.
result3 = selector.xpath('//li[position()<3]/a/text()')
print(result3)
# last()-2 is the third-from-last <li>.
result4 = selector.xpath('//li[last()-2]/a/text()')
print(result4)
输出
['first item']
['fifth item']
['first item', 'second item']
['third item']
节点轴选择
from lxml import etree

text3 = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
selector = etree.HTML(text3)
# ancestor::* -> every ancestor element of the first <li>.
result1 = selector.xpath('//li[1]/ancestor::*')
print(result1)
# ancestor::div -> only the ancestors named <div>.
result2 = selector.xpath('//li[1]/ancestor::div')
print(result2)
# attribute::* -> all attribute values of the first <li>.
result3 = selector.xpath('//li[1]/attribute::*')
print(result3)
# child::a[...] -> <a> child elements whose href is "link1.html".
result4 = selector.xpath('//child::a[@href="link1.html"]')
print(result4)
# descendant::span -> <span> elements anywhere below the first <li>.
result5 = selector.xpath('//li[1]/descendant::span')
print(result5)
# following::*[2] -> the second element after the first <li> in document order.
result6 = selector.xpath('//li[1]/following::*[2]')
print(result6)
# following-sibling::* -> all later siblings of the first <li>.
result7 = selector.xpath('//li[1]/following-sibling::*')
print(result7)
输出
[<Element html at 0x1761c02db88>, <Element body at 0x1761c07bf08>, <Element div at 0x1761c078308>, <Element ul at 0x1761c086088>]
[<Element div at 0x1761c078308>]
['item-0']
[<Element a at 0x1761c086288>]
[<Element span at 0x1761c06e6c8>]
[<Element a at 0x1761c06e688>]
[<Element li at 0x1761c078b08>, <Element li at 0x1761c078648>, <Element li at 0x1761c0864c8>, <Element li at 0x1761c086448>]
转载于:https://www.cnblogs.com/locke-hu/p/9236409.html