from lxml import etree
if __name__ == '__main__':
    # Load the sample page through lxml's HTML parser. Every XPath exercise
    # below is kept commented out: enable one at a time and run it on `tree`.
    # NOTE: etree.parse() hands back an ElementTree for the document; use
    # tree.getroot() when the root element itself is needed.
    html_parser = etree.HTMLParser()
    tree = etree.parse(source="index.html", parser=html_parser)
    # print(etree.tostring(tree))

    # ex01: select every li node.
    # `//` selects descendants: starting from the document root, find all
    # descendant nodes named li.
    # results = tree.xpath("//li")
    # print(type(results))
    # for result in results:
    #     # print(type(result))
    #     print(etree.tostring(result))

    # ex02: select every li node that has a class attribute.
    # Attribute tests follow the element name and are wrapped in [].
    # results = tree.xpath("//li[@class]")
    # for result in results:
    #     print(etree.tostring(result))

    # ex03: select every li node whose class attribute equals item-0.
    # Nested string literals need a different quote style; Python strings
    # can be written with single, double or triple quotes.
    # results = tree.xpath("//li[@class='item-0']")
    # for result in results:
    #     print(etree.tostring(result))

    # ex04: select the value of the class attribute of every li node.
    # results = tree.xpath("//li/@class")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex05: select the text inside every li node whose class is item-1.
    # results = tree.xpath("//li[@class='item-1']/a/text()")
    # results = tree.xpath("//li[@class='item-1']//text()")
    # for result in results:
    #     # print(etree.tostring(result))
    #     print(result)

    # ex06: select the first li element under ul.
    # XPath positions start at 1, not 0.
    # results = tree.xpath("//ul/li[1]")
    # for result in results:
    #     print(etree.tostring(result))

    # ex07: select the first three li elements under ul.
    # results = tree.xpath("//ul/li[position() < 4]")
    # for result in results:
    #     print(etree.tostring(result))

    # ex08: select the last li element under ul.
    # results = tree.xpath("//ul/li[last()]")
    # for result in results:
    #     print(etree.tostring(result))

    # ex09: select the last three li elements under ul.
    # The first form hard-codes the offset, so it only yields "the last
    # three" when the ul holds exactly five li elements; the second form
    # is relative to last() and works for any list length.
    # results = tree.xpath("//ul/li[position() > 2]")
    # for result in results:
    #     print(etree.tostring(result))
    # results = tree.xpath("//ul/li[position() > last()-3]")
    # for result in results:
    #     print(etree.tostring(result))

    # ex10: select the li nodes whose class attribute contains "item".
    # contains() takes the attribute first and the substring second.
    # results = tree.xpath("//li[contains(@class, 'item')]")
    # for result in results:
    #     print(etree.tostring(result))

    # ex11: select every li node whose class is item-0 or item-1.
    # The contains() variant is a special-case answer that happens to fit
    # this input; it is not the normal way to write the query.
    # results = tree.xpath("//li[contains(@class, 'item-0') or contains(@class, 'item-1')]")
    # for result in results:
    #     print(etree.tostring(result))
    # results = tree.xpath("//li[@class = 'item-0' or @class = 'item-1']")
    # for result in results:
    #     print(etree.tostring(result))

    # ex12: select every node whose class is item-0 or bold.
    # Either union two element-specific paths with `|`, or match any
    # element with `*` and test the attribute.
    # results = tree.xpath("//li[@class='item-0'] | //span[@class='bold']")
    # for result in results:
    #     print(etree.tostring(result))
    # results = tree.xpath("//*[@class='item-0' or @class='bold']")
    # for result in results:
    #     print(etree.tostring(result))
<!DOCTYPE html>
<!-- Sample page for the XPath exercises; presumably the "index.html" that the accompanying lxml scripts parse (confirm the file name). -->
<html lang="en">
<head>
<meta charset="UTF-8">
<title>index</title>
</head>
<body>
<div>
<!-- Five list items with classes item-0, item-1, item-inactive, item-1, item-0; only the third wraps its link text in a span with class "bold". -->
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</body>
</html>
from lxml.etree import tostring, tostringlist,parse, HTMLParser
if __name__ == '__main__':
    # Read the sample page as UTF-8 text and build the tree with lxml's
    # HTML parser. The file is only needed while parsing; all queries
    # below work on the in-memory `tree`.
    with open("index.html", mode="rt", encoding="utf-8") as page:
        tree = parse(source=page, parser=HTMLParser())
    # print(tostring(tree))

    # ex01: get every li node.
    # results = tree.xpath("//li")
    # print(type(results))
    #
    # for result in results:
    #     # print(type(result))
    #     print(tostring(result))

    # ex02: get every class attribute in the document (the query selects
    # the attributes themselves, so the values are printed).
    # results = tree.xpath("//@class")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex03: li nodes that have a class attribute -- return the attribute value.
    # results = tree.xpath("//li/@class")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex04: li nodes that have a class attribute -- return the nodes.
    # results = tree.xpath("//li[@class]")
    # for result in results:
    #     # print(type(result))
    #     print(tostring(result))

    # ex05: li nodes whose class is item-0 -- return the nodes.
    # results = tree.xpath("//li[@class='item-0']")
    # for result in results:
    #     # print(type(result))
    #     print(tostring(result))

    # ex06: li nodes that have a class attribute -- return their text.
    # results = tree.xpath("//li[@class]/a/text()")
    # results = tree.xpath("//li[@class]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex07: li nodes whose class contains "1" -- return their text.
    # results = tree.xpath("//li[contains(@class, '1')]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex08: li nodes whose class starts with "item" -- return their text.
    # results = tree.xpath("//li[starts-with(@class, 'item')]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)
    # ends-with() is an XPath 2.0 function. NOTE(review): lxml evaluates
    # XPath 1.0, so this query is expected to raise rather than match.
    # results = tree.xpath("//li[ends-with(@class, 'item')]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex09: the first three li nodes -- return their text.
    # results = tree.xpath("//li[position() < 4]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex10: the last three li nodes -- return their text.
    # (The fixed offset only means "last three" for a five-item list.)
    # results = tree.xpath("//li[position() > 2]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex11: the first li node -- return its text.
    # results = tree.xpath("//li[1]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex12: the last li node -- return its text.
    # results = tree.xpath("//li[last()]//text()")
    # for result in results:
    #     # print(type(result))
    #     print(result)

    # ex13: li nodes whose class is item-0 or item-1 -- return their text.
    item_texts = tree.xpath("//li[@class='item-0' or @class='item-1']//text()")
    for text in item_texts:
        # print(type(text))
        print(text)
|