XPath Parsing
Parsing a Local File
xpath() always returns a list. Basic XPath syntax:

- Path queries
  - // : selects all descendant nodes
  - / : selects direct child nodes
- Predicate queries
  - //div[@id]
  - //div[@id="maincontent"]
- Attribute queries
  - //@class
- Fuzzy queries
  - //div[contains(@id,"ci")]
  - //div[starts-with(@id,"he")]
- Content queries
  - //div/h1/text()
- Logical operators
  - //div[@id="head" and @class="s_down"]
  - //title | //price (an alternative form: the union of two node sets)
from lxml import etree

# Parse a local HTML file (the file must be well-formed XML)
tree = etree.parse('F:/Temp/img/New_file.html')

# Path query: li children of ul under body
li_list = tree.xpath('//body/ul/li')
# Predicate query: text of every li that has an id attribute
li_list = tree.xpath('//ul/li[@id]/text()')
# Predicate query: text of the li whose id is "l1"
li_list = tree.xpath('//ul/li[@id="l1"]/text()')
# Attribute query: class attribute of the li whose id is "l1"
li = tree.xpath('//ul/li[@id="l1"]/@class')
# Fuzzy queries: id contains "l" / id starts with "u"
li_list = tree.xpath('//ul/li[contains(@id,"l")]/text()')
li_list = tree.xpath('//ul/li[starts-with(@id,"u")]/text()')
# Logical operators
li_list = tree.xpath('//ul/li[@id="l1" and @class="c2"]/text()')
li_list = tree.xpath('//ul/li[@id="l3" or @id="l4"]/text()')
print(li_list)
New_file.html:
<html>
  <head>
    <meta charset="utf-8"/>
    <title></title>
  </head>
  <body>
    <ul>
      <li id="l1" class='c2'>北京</li>
      <li id="l2">上海</li>
      <li class='c1'>广州</li>
      <li class='c2'>深圳</li>
    </ul>
    <ul>
      <li id="l3">成都</li>
      <li id="u1">西安</li>
      <li id="l4">昆明</li>
      <li id="u2">杭州</li>
    </ul>
  </body>
</html>
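The union operator | from the syntax list is not exercised in the example above; a minimal sketch against the same New_file.html:

from lxml import etree

tree = etree.parse('F:/Temp/img/New_file.html')
# Union: merge the results of two queries into one document-ordered list
li_list = tree.xpath('//li[@id="l1"]/text() | //li[@id="l2"]/text()')
print(li_list)  # ['北京', '上海']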
Parsing a Web Page
You can also parse a page fetched over the network and extract content from it directly, without saving the page to a local file first. To experiment with expressions, install the XPath Helper browser extension; once installed, the shortcut Ctrl+Shift+X opens two black boxes at the top of the page. Type an XPath expression into the left box and the matched content appears in the right one, which is convenient during development.
import urllib.request
from lxml import etree

url = 'https://www.baidu.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
}
# Fetch the page
request = urllib.request.Request(url=url, headers=headers)
res = urllib.request.urlopen(request)
content = res.read().decode('utf-8')

# etree.HTML() parses an HTML string; etree.parse() is for local files
tree = etree.HTML(content)
# The value attribute of Baidu's search button
result = tree.xpath('//input[@id="su"]/@value')
print(result)
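xpath() returns a list even when only one node matches; a short follow-up (a sketch, assuming the query matched something) indexes into it for the bare value:

# result is a one-element list; take the value itself
if result:
    print(result[0])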
Downloading Images
Task: download the food images from the first ten pages of 站长素材 (sc.chinaz.com). The script below fetches five pages; adjust end_page to cover all ten.
import urllib.request
from lxml import etree

''' URL pattern for each page:
https://sc.chinaz.com/tupian/meishitupian.html
https://sc.chinaz.com/tupian/meishitupian_3.html
https://sc.chinaz.com/tupian/meishitupian_5.html
https://sc.chinaz.com/tupian/meishitupian_10.html
'''

def createRequest(page):
    # Page 1 has no suffix; later pages use _<page>.html
    if page == 1:
        url = 'https://sc.chinaz.com/tupian/meishitupian.html'
    else:
        url = 'https://sc.chinaz.com/tupian/meishitupian_' + str(page) + '.html'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
    }
    request = urllib.request.Request(url=url, headers=headers)
    return request

def getContent(request):
    res = urllib.request.urlopen(request)
    content = res.read().decode('utf-8')
    return content

def download(content):
    tree = etree.HTML(content)
    name_list = tree.xpath('//div[@id="container"]//a/img/@alt')
    # The page lazy-loads images: the real URL lives in src2, not src
    src_list = tree.xpath('//div[@id="container"]//a/img/@src2')
    for i in range(len(name_list)):
        name = name_list[i]
        src = src_list[i]
        # Drop the thumbnail suffix '_s.jpg' and re-append '.jpg' for the full-size image
        url = 'https:' + src[:-6] + '.jpg'
        urllib.request.urlretrieve(url=url, filename='F:/Temp/Pachong/' + name + '.jpg')

if __name__ == "__main__":
    start_page = 1
    end_page = 5
    for page in range(start_page, end_page + 1):
        request = createRequest(page)
        content = getContent(request)
        download(content)
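One practical caveat: urllib.request.urlretrieve() fails if the destination folder does not exist. A small hedged addition (assuming the same F:/Temp/Pachong path used above) creates it before downloading:

import os

# Make sure the download directory exists (no-op if it already does)
os.makedirs('F:/Temp/Pachong', exist_ok=True)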
JsonPath Parsing
Parsing a Local File
jsonpath operates on a Python object that has already been loaded, so a web resource must be downloaded (or otherwise read into memory) first; see the sketch after the store.json listing below. Reference: https://blog.csdn.net/fu_huo_1993/article/details/88350147
import json
import jsonpath

obj = json.load(open('store.json', 'r', encoding='utf-8'))

# Authors of all books in the store
author_list = jsonpath.jsonpath(obj, '$.store.book[*].author')
# All authors, via recursive descent
author_list = jsonpath.jsonpath(obj, '$.store..author')
# All prices in the store
price_list = jsonpath.jsonpath(obj, '$.store..price')
# All direct children of store (book and bicycle)
tag_list = jsonpath.jsonpath(obj, '$.store.*')
# The third book
book = jsonpath.jsonpath(obj, '$..book[2]')
# The last book
book = jsonpath.jsonpath(obj, '$..book[(@.length-1)]')
# The first two books, two ways
books = jsonpath.jsonpath(obj, '$..book[0,1]')
books = jsonpath.jsonpath(obj, '$..book[:2]')
# Filters: books that have an isbn / books priced above 10
book_list = jsonpath.jsonpath(obj, '$..book[?(@.isbn)]')
book_list = jsonpath.jsonpath(obj, '$..book[?(@.price>10)]')
store.json:
{ "store": {
"book": [
{ "category": "修真",
"author": "六道",
"title": "坏蛋是怎样练成的",
"price": 8.95
},
{ "category": "修改",
"author": "天蚕土豆",
"title": "斗破苍穹",
"price": 12.99
},
{ "category": "修真",
"author": "唐家三少",
"title": "斗罗大陆",
"isbn": "0-553-21311-3",
"price": 8.99
},
{ "category": "修真",
"author": "南派三叔",
"title": "星辰变",
"isbn": "0-395-19395-8",
"price": 22.99
}
],
"bicycle": {
"color": "黑色",
"price": 19.95
}
}
}
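As noted above, jsonpath needs a loaded Python object rather than a file per se, so a downloaded response body can also be parsed in memory with json.loads() instead of going through a temporary file. A minimal sketch (the raw string below is a stand-in for a real response body):

import json
import jsonpath

raw = '{"store": {"book": [{"author": "六道"}]}}'  # stand-in for res.read().decode('utf-8')
obj = json.loads(raw)  # deserialize the string directly; no file needed
print(jsonpath.jsonpath(obj, '$..author'))  # ['六道']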
Parsing the Cities Covered by Taopiaopiao
import json
import jsonpath
import urllib.request

url = 'https://dianying.taobao.com/cityAction.json?activityId&_ksTS=1629789477003_137&jsoncallback=jsonp138&action=cityAction&n_s=new&event_submit_doGetAllRegion=true'
headers = {
    'accept': 'text/javascript, application/javascript, application/ecmascript, application/x-ecmascript, */*; q=0.01',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cookie': 'cna=UkO6F8VULRwCAXTqq7dbS5A8; miid=949542021157939863; sgcookie=E100F01JK9XMmyoZRigjfmZKExNdRHQqPf4v9NIWIC1nnpnxyNgROLshAf0gz7lGnkKvwCnu1umyfirMSAWtubqc4g%3D%3D; tracknick=action_li; _cc_=UIHiLt3xSw%3D%3D; enc=dA18hg7jG1xapfVGPHoQCAkPQ4as1%2FEUqsG4M6AcAjHFFUM54HWpBv4AAm0MbQgqO%2BiZ5qkUeLIxljrHkOW%2BtQ%3D%3D; hng=CN%7Czh-CN%7CCNY%7C156; thw=cn; _m_h5_tk=3ca69de1b9ad7dce614840fcd015dcdb_1629776735568; _m_h5_tk_enc=ab56df54999d1d2cac2f82753ae29f82; t=874e6ce33295bf6b95cfcfaff0af0db6; xlly_s=1; cookie2=13acd8f4dafac4f7bd2177d6710d60fe; v=0; _tb_token_=e65ebbe536158; tfstk=cGhRB7mNpnxkDmUx7YpDAMNM2gTGZbWLxUZN9U4ulewe025didli6j5AFPI8MEC..; l=eBrgmF1cOsMXqSxaBO5aFurza77tzIRb8sPzaNbMiInca6OdtFt_rNCK2Ns9SdtjgtfFBetPVKlOcRCEF3apbgiMW_N-1NKDSxJ6-; isg=BBoas2yXLzHdGp3pCh7XVmpja8A8S54lyLj1RySTHq14l7vRDNufNAjpZ2MLRxa9',
    'referer': 'https://dianying.taobao.com/',
    'sec-ch-ua': '"Chromium";v="92", " Not A;Brand";v="99", "Google Chrome";v="92"',
    'sec-ch-ua-mobile': '?0',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-origin',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
    'x-requested-with': 'XMLHttpRequest',
}

request = urllib.request.Request(url=url, headers=headers)
response = urllib.request.urlopen(request)
content = response.read().decode('utf-8')
# The response is JSONP, e.g. jsonp138({...}); strip the wrapper to get bare JSON
content = content.split('(')[1].split(')')[0]

with open('jsonpath解析淘票票.json', 'w', encoding='utf-8') as fp:
    fp.write(content)

obj = json.load(open('jsonpath解析淘票票.json', 'r', encoding='utf-8'))
# Collect every regionName anywhere in the document
city_list = jsonpath.jsonpath(obj, '$..regionName')
print(city_list)
BeautifulSoup Parsing
BeautifulSoup is similar to XPath in that it can parse both local files and live web pages.
Basic Syntax
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('F:/Temp/img/New_file.html', 'r', encoding='utf-8'), 'lxml')

# Tag shortcuts: first matching tag and its attributes
print(soup.a)
print(soup.a.attrs)

# find(): first match, optionally filtered by attribute
print(soup.find('a'))
print(soup.find('a', title='a2'))
print(soup.find('a', class_='a1'))  # class_ because class is a Python keyword
print('-' * 40)

# find_all(): every match
print(soup.find_all('a'))
print(soup.find_all(['a', 'span']))
print(soup.find_all('li', limit=2))  # only the first two li
print('-' * 40)

# select(): CSS selectors
print(soup.select('a'))
print(soup.select('.a1'))         # by class
print(soup.select('#l1'))         # by id
print(soup.select('li[id]'))      # li with an id attribute
print(soup.select('li[id="l2"]'))
print('-' * 40)

# Descendant, child, and grouped selectors
print(soup.select('div li'))
print(soup.select('div > ul > li'))
print(soup.select('a,li'))

# Node content and attributes
print(soup.select('#app')[0].string)      # None when the tag contains more than one node
print(soup.select('#app')[0].get_text())  # concatenates all descendant text
print(soup.select('#p1')[0].name)
print(soup.select('#p1')[0].attrs)
# Three equivalent ways to read a single attribute
print(soup.select('#p1')[0].attrs.get('class'))
print(soup.select('#p1')[0].get('class'))
print(soup.select('#p1')[0]['class'])
New_file.html:
<html>
  <head>
    <meta charset="utf-8"/>
    <title></title>
  </head>
  <body>
    <div>
      <ul>
        <li id="l1">白起</li>
        <li id="l2">廉颇</li>
        <li class='c1'>李牧</li>
        <li class='c2'>王翦</li>
        <a href="www.csdn.com" id="" class="a1">CSDN</a>
        <span>你大爷</span>
      </ul>
    </div>
    <a href="www.baidu.com" title="a2">百度</a>
    <div id='app'>
      <span>老大爷</span>
    </div>
    <p id="p1" class="p1">爬虫</p>
  </body>
</html>
Scraping the Starbucks Menu (Images and Names)
import urllib.request
from bs4 import BeautifulSoup

url = 'https://www.starbucks.com.cn/menu/'
res = urllib.request.urlopen(url)
content = res.read().decode('utf-8')

soup = BeautifulSoup(content, 'lxml')
# Product names sit in <strong> tags; each image URL is in a sibling <div>'s inline style
name_list = soup.select('ul[class="grid padded-3 product"] strong')
src_list = soup.select('ul[class="grid padded-3 product"] div')

for i in range(len(name_list)):
    name = name_list[i].get_text()
    # '/' is illegal in a filename, so drop it
    if name.find('/') > 0:
        name = name.replace('/', '')
    # Cut the image path out of style='background-image: url("/upload/...")'
    src = str(src_list[i].attrs.get('style')).split('"/')[1].split('")')[0]
    url = 'https://www.starbucks.com.cn/' + src
    urllib.request.urlretrieve(url=url, filename='F:/Temp/Pachong/' + name + '.jpg')
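The string slicing above depends on the exact quoting inside the style attribute. A hedged alternative is to extract the path with a regular expression; this sketch assumes the style value looks like background-image: url("/upload/….jpg"), which may change if the page is redesigned:

import re

style = 'background-image: url("/upload/a1.jpg")'  # hypothetical style value
m = re.search(r'url\("?([^")]+)"?\)', style)
if m:
    print('https://www.starbucks.com.cn/' + m.group(1).lstrip('/'))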