Create package: sqProcess (SQLite database processing)
Create file: sqProcessDataAcquisition (data scraping)
— Web crawler: data scraping
"""
-*- codeing = utf-8 -*-
@ProojectNaem : Automation
@File : sqProcessDataAcquisition.py
@DataTime : 2022-04-27 21:06
@Author : ChairDu
@Email : chair7@163.com
@Descriptioon : 数据爬取,爬取电影信息
"""
import os
import sys
sys.path.append(os.path.dirname(os.path.dirname(__file__)))

import requests
from bs4 import BeautifulSoup

url = "https://ssr1.scrape.center"


def analytical_url(url, port=''):
    """Send a GET request and parse the HTML text into a BeautifulSoup object."""
    _url = "{0}{1}".format(url, port)
    headers = {
        "Content-Type": "text/html; charset=utf-8",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36",
    }
    response = requests.request("GET", url=_url, headers=headers)
    response.encoding = "utf-8"
    return BeautifulSoup(response.text, "html.parser")
def reptilePageLabel():
    """Collect the href of every pagination link on the index page."""
    try:
        return [soup.find('a').get("href")
                for soup in analytical_url(url).find_all('li', attrs={"class": "number"})]
    except Exception as error:
        print(error)
        return []
def reptileDetailLabel():
    """Collect the detail-page href of every movie on each index page."""
    detail_links = []
    for port in reptilePageLabel():
        cards = analytical_url(url=url, port=port).find_all(
            "div", attrs={"class": "el-col el-col-18 el-col-offset-3"})
        for card in cards:
            anchors = card.find_all("a", attrs={"class": ""})
            detail_links.extend([anchor.get("href") for anchor in anchors])
    return detail_links
def sqProcessDataAcquisition():
    for port in reptileDetailLabel():
        soup = analytical_url(url=url, port=port)
        # Movie poster, name, specification, genres, region, duration,
        # release date, synopsis, rating
        film_photo = film_name = film_specification = film_type = ''
        film_region = film_duration = release_date = film_detail = film_rating = ''
        # Movie tag buttons, director info, actor info
        film_tables, director_messages, actor_messages = [], [], []
        try:
            film_name = soup.find("h2", attrs={"class": "m-b-sm"}).text
            print(film_name)
        except Exception:
            continue
        try:
            film_photo = soup.find("img", attrs={"class": "cover"}).get("src")
            film_tables = [table.text for table in soup.find_all(
                "button", attrs={"class": "el-button category el-button--primary el-button--mini"})]
            film_specification = film_tables[0]
            film_type = film_tables[1:]
            film_region = soup.find("div", attrs={"class": "m-v-sm info"}).find("span").text
            film_duration = soup.find("div", attrs={"class": "m-v-sm info"}).find_all("span")[2].text
            release_date = soup.find_all("div", attrs={"class": "m-v-sm info"})[1].find("span").text
            film_detail = soup.find("div", attrs={"class": "drama"}).find("p").text.strip()
            film_rating = soup.find("p", attrs={"class": "score m-t-md m-b-n-sm"}).text.strip()
        except Exception:
            pass
        director_messages = soup.find_all("div", attrs={"class": "director el-col el-col-4"})
        for director_message in director_messages:
            # Director name, director photo
            director_name, director_photo = '', ''
            try:
                director_name = director_message.find("p").text
                director_photo = director_message.find("img").get("src")
            except Exception:
                pass
        actor_messages = soup.find_all("div", attrs={"class": "actor el-col el-col-4"})
        for actor_message in actor_messages:
            # Actor photo, actor name, role played
            actor_photo, actor_name, play_the_part_of = '', '', ''
            actor = actor_message.find("div", attrs={"class": "el-card__body"})
            try:
                actor_photo = actor.find("img").get("src")
                actor_name = actor.find_all("p")[0].text
                play_the_part_of = actor.find_all("p")[-1].text
            except Exception:
                pass
        print("-----------")
        break  # stop after the first detail page
if __name__ == '__main__':
    sqProcessDataAcquisition()
— Web crawler: writing the data to the database
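The original post left this block as an editor placeholder. Below is a minimal sketch of the database-writing step, assuming the fields scraped in sqProcessDataAcquisition() are collected into one dict per movie; the sqProcess.db filename, the film_message table, and the init_db/save_film helpers are illustrative names, not from the original.

import sqlite3


def init_db(db_path="sqProcess.db"):
    """Create the movie table if it does not exist yet (schema is an assumption)."""
    connection = sqlite3.connect(db_path)
    connection.execute("""
        CREATE TABLE IF NOT EXISTS film_message (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            film_name TEXT, film_photo TEXT, film_specification TEXT,
            film_type TEXT, film_region TEXT, film_duration TEXT,
            release_date TEXT, film_detail TEXT, film_rating TEXT
        )
    """)
    connection.commit()
    return connection


def save_film(connection, film):
    """Insert one movie record; `film` is a dict of the fields scraped above."""
    connection.execute(
        "INSERT INTO film_message (film_name, film_photo, film_specification, film_type,"
        " film_region, film_duration, release_date, film_detail, film_rating)"
        " VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)",
        (film.get("film_name"), film.get("film_photo"), film.get("film_specification"),
         ",".join(film.get("film_type") or []), film.get("film_region"),
         film.get("film_duration"), film.get("release_date"),
         film.get("film_detail"), film.get("film_rating")),
    )
    connection.commit()


if __name__ == '__main__':
    connection = init_db()
    # Hypothetical record shaped like the fields gathered in sqProcessDataAcquisition().
    save_film(connection, {"film_name": "霸王别姬", "film_rating": "9.5",
                           "film_type": ["剧情", "爱情"]})
    connection.close()

The parameterized "?" placeholders keep the inserts safe even when a synopsis contains quotes, and the genre list is joined into a single comma-separated column for simplicity.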