安装
pip install scrapy
导入
from scrapy.selector import Selector
待提取文本
# Sample HTML fragment queried by the XPath examples in this file.
# Note the bare text nodes (1-6) and the <a> elements placed directly inside
# the <tr>, outside of any <td>.
content = """
<table class="tab">
<tr class="cdf">
1<a>tr下的第一个a标签</a>2
3<td class="cdf1">td1</td>4
5<td class="cdf2">td2</td>6
<td class="cdf3">td3的文本<a>td3的a标签</a></td>
<td class="cdf4">td4</td>
<td class="cdf5">td5</td>
<td class="cdf6">td6</td>
<a>tr下的第二个a标签</a>
</tr>
</table>
<div id='cdf-xyz'>table外的div</div>
"""
# Wrap the markup in a Selector so it can be queried with .xpath().
html = Selector(text=content)
使用
- position():选取位置
- not、and:逻辑判断
- parent、child:父节点、子节点
# Parent node of the <td> whose class is "cdf1".
res2_parent = html.xpath("//td[@class='cdf1']/parent::*").get()
# Fourth <td> child of the <tr> whose class is "cdf".
res2_child = html.xpath("//tr[@class='cdf']/child::td[4]").get()
print(res2_parent)
print(res2_child)
- following:选取文档中当前节点的结束标签之后的所有节点
- preceding:选取文档中当前节点的开始标签之前的所有节点(不包括祖先节点)
# Text nodes located after the end of td.cdf1 (following) and before its
# start (preceding). Between them these queries also pick up the bare
# text 1 2 3 4 5 6 that sits outside the tags.
res3_following = html.xpath("//td[@class='cdf1']/following::text()").getall()
res3_preceding = html.xpath("//td[@class='cdf1']/preceding::text()").getall()
print(res3_following)
print(res3_preceding)
- string():获取节点下的所有文本
- contains():包含
- sibling:兄弟节点(following-sibling、preceding-sibling)
# Alternative: every <td> sibling that comes after td.cdf5.
# res6 = html.xpath("//td[@class='cdf5']/following-sibling::td").getall()
# preceding-sibling is a reverse axis, so positions count backwards from the
# context node: td[2] is the second <td> before td.cdf5 (the cdf3 cell).
res6 = html.xpath("//td[@class='cdf5']/preceding-sibling::td[2]").getall()
print(res6)
- re:match():正则表达式
# Select every <td> whose class attribute matches the regular expression.
# The XPath is written as a raw string so that \d reaches the regex engine
# unchanged instead of relying on Python's handling of an unrecognised
# escape sequence.
res7 = html.xpath(r"//td[re:match(@class, 'cdf\d+')]").getall()
print(res7)
-