爬虫实战–用xpath技术爬取猫眼电影的信息
前言
提示:以下是本篇文章正文内容,下面案例可供参考
一、步骤
(1)导入requests库和解析库 (2)定义发送请求函数 (3)定义解析数据函数 (4)定义保存数据函数 (5)定义主函数调用以上所有函数完成任务 (6)调用主函数执行程序
二、模块化代码
1.搭建结构
先定义好各个模块(函数)的名称,搭出程序骨架,之后再逐个填写具体的实现
def getHtml():
    """Stub: will send the HTTP request and return the page HTML."""
    pass


def i_xpath(htmlList):
    """Stub: will parse the downloaded HTML with XPath."""
    pass


def saveCsv(infos):
    """Stub: will save the parsed rows to a CSV file."""
    pass


def main():
    """Stub: will call the three functions above in order."""
    # Placeholder ("调用" = "call"): invoke getHtml(), i_xpath() and saveCsv()
    # here once they are implemented.
    pass


if __name__ == '__main__':
    main()
2.使用解析xpath
**先确定网址**,只要写问号之前的部分就行了;然后按 Ctrl+Shift+X 调出 XPath 解析工具,在页面上调试 XPath 表达式
该处通过 url 发送网络请求来获取数据。
总代码展示
import requests
import csv
from lxml import etree
def getHtml():
    """Download the Maoyan board page and return its HTML wrapped in a list.

    Returns:
        list[str]: A one-element list holding the UTF-8 decoded response body
        of ``https://www.maoyan.com/board/4?``.

    Raises:
        requests.RequestException: If the request times out, fails at the
            network level, or the server answers with a 4xx/5xx status.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = "https://www.maoyan.com/board/4?"
    # NOTE(review): the Cookie below looks like a captured browser session and
    # has presumably expired -- confirm and refresh it before relying on it.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36",
        "Cookie": "__mta=150305673.1639141940631.1639142023606.1639276231473.6; uuid_n_v=v1; uuid=CCF7E53059BA11ECA89A6133DE3B9B2AE3587645357A4194974E491487C4EF49; _lxsdk_cuid=17da4791cd2c8-02be152f58b7bf-978153c-144000-17da4791cd2a1; _lxsdk=CCF7E53059BA11ECA89A6133DE3B9B2AE3587645357A4194974E491487C4EF49; _csrf=ae9be839c78ec78b8d2beb3573ccbc41ea677217e06d0feaf0dd8dabcfea2439; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1639141940,1639141951,1639276224; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; __mta=150305673.1639141940631.1639141951466.1639276224556.3; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1639276231; _lxsdk_s=17dac7a1fa5-627-a10-e8e%7C%7C5",
    }
    # Without a timeout requests may wait forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on an HTTP error instead of handing an error page to the parser.
    response.raise_for_status()
    return [response.content.decode("utf-8")]
# Module-level accumulator: i_xpath() appends one row per movie to this list,
# and main() later passes the same list to saveCsv().
movieInfos = []


def _first(nodes, default=""):
    """Return the first XPath result in *nodes*, or *default* when it is empty."""
    return nodes[0] if nodes else default


def i_xpath(htmlList):
    """Extract one row per ``<dd>`` element and append it to ``movieInfos``.

    Each appended row is
    ``[movie_name, movie_address, movie_picture, movie_time, movie_score]``.

    Args:
        htmlList: Iterable of HTML document strings to parse.

    Returns:
        list: The module-level ``movieInfos`` list (the same object that was
        appended to), so callers may use either the return value or the global.
    """
    for html in htmlList:
        source = etree.HTML(html)
        for movie in source.xpath("//dd"):
            movie_name = _first(movie.xpath('.//p[@class="name"]/a/text()'), None)
            if movie_name is None:
                # A <dd> without a name link is not a movie entry; skip it
                # instead of crashing with IndexError on the missing node.
                continue
            movie_picture = _first(movie.xpath('.//img[@class="board-img"]/@data-src'))
            # The original path used the predicate p["title"]; a non-empty
            # string literal is always true in XPath, so it filtered nothing.
            # Dropping it selects exactly the same nodes: the first direct text
            # node among the <p> children of the info <div>.
            # NOTE(review): presumably this is the starring line (saveCsv
            # labels the column "主演") -- confirm against the page markup.
            movie_address = (
                _first(movie.xpath('.//div[@class="movie-item-info"]/p/text()'))
                .replace("\n", "")
                .replace(" ", "")
            )
            movie_time = _first(movie.xpath('.//div[@class="movie-item-info"]/p[3]/text()'))
            # The score is split across two <i> elements; join their texts.
            score1 = _first(movie.xpath('.//div/p[@class="score"]/i[1]/text()'))
            score2 = _first(movie.xpath('.//div/p[@class="score"]/i[2]/text()'))
            movie_score = score1 + score2
            movieInfos.append(
                [movie_name, movie_address, movie_picture, movie_time, movie_score]
            )
    print("--------源码解析完成!----------")
    print(f"总爬取{len(movieInfos)}条!")
    return movieInfos
def saveCsv(infos, filename="猫眼.csv"):
    """Write the scraped rows to a UTF-8 CSV file, preceded by a header row.

    Args:
        infos: Iterable of rows; each row is an iterable of cell values written
            in order under the header columns.
        filename: Destination path. Defaults to ``"猫眼.csv"`` in the current
            working directory, the path that was previously hard-coded.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    headers_csv = ["电影名称", "主演", "图片地址", "上映时间", "得分"]
    # newline="" lets the csv module control line endings itself, which avoids
    # the extra "\r" (blank-looking rows) on platforms that write "\r\n".
    with open(filename, "w", encoding="utf-8", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers_csv)
        writer.writerows(infos)
    print("------------文件写入完成!-----------")
def main():
    """Run the whole job: download the page, parse it, save the rows to CSV."""
    html_pages = getHtml()
    i_xpath(html_pages)
    # i_xpath() fills the module-level movieInfos list as a side effect, so
    # that list (not a return value) is what gets written out.
    saveCsv(movieInfos)


if __name__ == "__main__":
    main()
|