安装lxml模块
pip install lxml
入门案例
from lxml import etree
xml = """
<book>
<id>1</id>
<name>野花</name>
<price>123</price>
<nick>豆腐</nick>
<author>
<nick id="10086">周大枪</nick>
<nick id="10010">周芷若</nick>
<nick class="joy">周杰伦</nick>
<nick class="jolin">蔡依林</nick>
<div>
<nick>惹了</nick>
</div>
<span>
<nick>惹2</nick>
</span>
</author>
<partner>
<nick id="bbc">宝宝沉</nick>
<nick id="bbbc">宝宝不沉</nick>
</partner>
</book>
"""
# Parse the XML literal above into an element tree rooted at <book>.
tree = etree.XML(xml)

# XPath basics demonstrated against the document above:
#   /    steps down exactly one level; the leading / starts at the root,
#        and text() selects an element's text.
#        tree.xpath('/book/name/text()')
#   //   matches descendants at any depth.
#        tree.xpath('/book/author//nick/text()')
#        -> ['周大枪', '周芷若', '周杰伦', '蔡依林', '惹了', '惹2']
#   *    matches any element at a single level, so only the <nick> nodes
#        wrapped in <div> / <span> are selected below.
any_child_nick_text = '/book/author/*/nick/text()'
ret = tree.xpath(any_child_nick_text)  # ['惹了', '惹2']
print(ret)
语法
from lxml import etree
# Parse the local file b.html with lxml's HTML parser, decoding it as UTF-8.
parser = etree.HTMLParser(encoding="utf-8")
tree = etree.parse('b.html', parser=parser)

# Alternative queries to try (sample results are from the author's b.html):
# ret = tree.xpath('/html/body/ul/li/a/text()')  # ['百度', '谷歌', '搜狗']
# ret = tree.xpath('/html/body/ul/li[1]/a/text()')  # ['百度']; [n] selects by position
# [@href='dapao'] keeps only the <a> elements whose href attribute equals 'dapao'.
ret = tree.xpath("/html/body/ol/li/a[@href='dapao']/text()")  # ['大炮']
print(ret)

ol_li_list = tree.xpath('/html/body/ol/li')
# Output of this snippet as recorded in the original notes (the first line
# comes from print(ret) above, the rest from the loop below):
# ['大炮']
# ['飞机']
# ['feiji']
# ['大炮']
# ['dapao']
# ['火车']
# ['huoche']
for li in ol_li_list:
    ret2 = li.xpath('./a/text()')  # ./ makes the query relative to this <li>
    print(ret2)
    ret3 = li.xpath('./a/@href')  # @name returns the attribute's value
    print(ret3)
技巧
快速定位并复制后一行的xpath
案例
import requests
from lxml import etree
url = 'xxxxxxxxxx'  # placeholder for the target site

# The context manager releases the connection even if parsing raises, and the
# timeout keeps the script from hanging forever on an unresponsive server.
with requests.get(url, timeout=10) as resp:
    # print(resp.text)
    html = etree.HTML(resp.text)

# One <div> per listing; this absolute path is tied to the page's current layout.
divs = html.xpath('/html/body/div[6]/div/div/div[2]/div[5]/div[1]/div')
for div in divs:
    # Each [0] / [1] below raises IndexError if the expected node is missing.
    price = div.xpath('./div/div/a[2]/div[2]/div[1]/span[1]/text()')[0].strip('¥')
    # NOTE: str.strip() treats its argument as a set of characters to remove
    # from both ends, not as a literal prefix.
    amount = div.xpath('./div/div/a[2]/div[2]/div[1]/span[2]/text()')[0].strip('近半年成交:')
    # NOTE(review): the title fragments are joined with the literal separator
    # 'saas' -- confirm this is intended.
    name = 'saas'.join(div.xpath('./div/div/a[2]/div[2]/div[2]/p/text()'))
    # The company name is taken from the second text node of the <p>.
    company = div.xpath('./div/div/a[1]/div[1]/p/text()')[1].strip()
    location = div.xpath('./div/div/a[1]/div[1]/div/span/text()')[0]
    print(f'标题:{name}; 价格:{price}; 销量:{amount}, 公司:{company}, 所在地:{location}')
?
|