xpath解析
- 最常用且最便捷高效的一种方式
- xpath解析原理:
- 实例化一个etree的对象,且需要将被解析的页面源码数据加载到该对象中
- 调用etree对象中的xpath方法结合xpath表达式实现标签的定位和内容的捕获
- 环境安装:pip install lxml
- 如何实例化一个etree对象
- 导包:from lxml import etree
- 将本地的HTML文档中的数据加载到该对象中:
- etree.parse(filePath,etree.HTMLParser())
- 解析本地文件时第二个参数最好加上,不然可能报错
- 可以将互联网上获取的源码数据加载到该对象中:etree.HTML(page_text)
- xpath('xpath表达式')
- 标签定位:
# 标签的定位
#最前面的/表示从根节点开始
# 一个标签返回一个element对象
r=tree.xpath('/html/head/title')
- 多层级定位
#一个//表示一个多层级,也可以表示从任意位置开始定位
r=tree.xpath('/html//title')
- 精准定位
#精准定位class为song的divs
r = tree.xpath('//div[@class="song"]')
- 索引定位
# 索引定位,返回第几个元素,且索引从1开始
r = tree.xpath('//div[@class="song"]/p[3]')
- 取直系文本
#取文本,text()返回的是一个列表,取的是直系内容
r = tree.xpath('//div[@class="song"]//li[5]/a/text()')
- 取非直系文本
#获取标签中非直系的文本内容
r = tree.xpath('//li[7]//text()')
- 取属性值
#取属性值
r = tree.xpath('//div[@class="song"]/img/@src')[0]
- 以上所有xpath方法返回的都是列表
xpath实战之爬取58二手房
import requests
from lxml import etree
if __name__ == '__main__':
    # Scrape the first text node of every "property-content-detail" div on
    # the 58.com Beijing second-hand housing page, print it, and write it to
    # 58.txt (one entry per line).
    url = 'https://bj.58.com/ershoufang/'
    # UA spoofing: send a browser-like User-Agent header with the request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    # Parse the downloaded HTML into an element tree.
    tree = etree.HTML(page_text)
    # Named detail_divs rather than `list` so the builtin is not shadowed.
    detail_divs = tree.xpath('//div[@class="property-content-detail"]')
    # The context manager closes the file even if a later step raises.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for div in detail_divs:
            # The leading "./" makes the expression relative to this div.
            texts = div.xpath('.//text()')
            if not texts:
                # A div without any text node has no title to record.
                continue
            title = texts[0]
            print(title)
            fp.write(title + '\n')
xpath实战之4k图片解析下载
import requests
from lxml import etree
import os
if __name__ == '__main__':
    # Download every image listed on the pic.netbian.com 4K gallery page
    # into the local picLibs directory, named after the image's alt text.
    url = 'https://pic.netbian.com/4kmeinv/'
    # UA spoofing: send a browser-like User-Agent header with each request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    response = requests.get(url=url, headers=head)
    # NOTE: an alternative to the per-name re-encoding below is to set the
    # response encoding manually (response.encoding = 'gbk') before reading
    # response.text.
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')

    out_dir = './picLibs'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    for li in li_list:
        # The src attribute is site-relative, so prefix the host.
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # Repair garbled Chinese text: turn the string back into bytes via
        # ISO-8859-1, then decode those bytes as GBK.
        # NOTE(review): presumably the page is GBK but was decoded as
        # ISO-8859-1 by requests -- confirm against the response headers.
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        img_data = requests.get(url=img_src, headers=head).content
        img_path = os.path.join(out_dir, img_name)
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
        print(img_name + '下载完成!!')
xpath实战之全国城市名称爬取
import requests
from lxml import etree
import os
if __name__ == '__main__':
    # Collect the city names shown on the aqistudy.cn history-data page:
    # first the "hot" cities, then the full city list, and print them
    # together with the total count.
    url = 'https://www.aqistudy.cn/historydata/'
    # UA spoofing: send a browser-like User-Agent header with the request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    tree = etree.HTML(page_text)

    # Alternative: both groups can be fetched in one query with a union:
    # tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')

    # Hot cities: <li> elements directly under the <ul>.
    all_city_names = [
        hot_li.xpath('./a/text()')[0]
        for hot_li in tree.xpath('//div[@class="bottom"]/ul/li')
    ]
    # All cities: <li> elements inside the second <div> under the <ul>.
    all_city_names.extend(
        city_li.xpath('./a/text()')[0]
        for city_li in tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    )
    print(all_city_names, len(all_city_names))
xpath实战之图片爬取
import requests
import os
from lxml import etree
if __name__ == '__main__':
    # Walk the vilipix.com ranking page, open every linked detail page, and
    # download each image found there into the local "p站" directory.
    url = 'https://www.vilipix.com/ranking'
    # UA spoofing: send a browser-like User-Agent header with each request.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    tree = etree.HTML(page_text)
    detail_links = tree.xpath('//div[@class="title"]/a')

    out_dir = './p站'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(out_dir, exist_ok=True)

    # Running counter across all detail pages; appended to each file name so
    # images sharing the same alt text do not overwrite each other.
    img_count = 0
    for link in detail_links:
        # The href attribute is site-relative, so prefix the host.
        detail_url = 'https://www.vilipix.com' + link.xpath('./@href')[0]
        detail_html = requests.get(url=detail_url, headers=head).text
        detail_tree = etree.HTML(detail_html)
        img_nodes = detail_tree.xpath('//a[@href="javascript: void(0)"]/img')
        for img in img_nodes:
            img_count += 1
            img_src = img.xpath('./@src')[0]
            img_bytes = requests.get(url=img_src, headers=head).content
            img_name = img.xpath('./@alt')[0] + str(img_count) + '.jpg'
            # "?" is replaced with "L" in the on-disk name only; the progress
            # message below still shows the unmodified name.
            img_path = os.path.join(out_dir, img_name.replace("?", "L"))
            with open(img_path, 'wb') as fp:
                fp.write(img_bytes)
            print(img_name + '下载完成!!')
    print("over!!!!!")
|