[系统运维] 自动化测试工程师摸索之路---sq数据加工(网络爬虫)

开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

-> 系统运维 -> 自动化测试工程师摸索之路---sq数据加工(网络爬虫) -> 正文阅读

[系统运维]自动化测试工程师摸索之路---sq数据加工(网络爬虫)

创建包 :sqProcess sqlite数据库加工

创建文件: sqProcessDataAcquisition 数据爬取

— 网络爬虫，数据爬取

"""
 -*- coding: utf-8 -*-
 @ProjectName : Automation
 @File : sqProcessDataAcquisition.py
 @DateTime : 2022-04-27 21:06
 @Author : ChairDu
 @Email : chair7@163.com
 @Description : Data crawling - scrape film information
 
"""

import os, sys
sys.path.append(os.path.dirname(os.path.dirname(__file__)))
import requests
from bs4 import BeautifulSoup as bs4tl

# Base address of the site to scrape; analytical_url appends an interface path to it.
url = "https://ssr1.scrape.center"
def analytical_url(url, port='', timeout=10):
    """Fetch a page with an HTTP GET and return it parsed by BeautifulSoup.

    Args:
        url: Base address (scheme and host) of the site.
        port: Interface path appended verbatim to ``url``. Despite the name
            this is a URL path such as ``/detail/7``, not a TCP port.
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block the crawl
            indefinitely.

    Returns:
        The response body, decoded as UTF-8 and parsed with the
        ``html.parser`` backend.

    Raises:
        requests.RequestException: If the request cannot be completed
            (connection failure, timeout, ...).
    """
    _url = "{0}{1}".format(url, port)   # domain + interface path -> full URL
    header = {"Content-Type": "text/html; charset=utf-8",
              "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.60 Safari/537.36"}
    respone = requests.get(_url, headers=header, timeout=timeout)
    # Force UTF-8 so ``.text`` does not depend on requests' charset guess.
    respone.encoding = "utf-8"
    return bs4tl(respone.text, "html.parser")
def reptilePageLabel():
    """Return the interface path behind each pagination link of the index page.

    The index page at the module-level ``url`` is fetched and every
    ``<li class="number">`` element is inspected; the ``href`` of its first
    ``<a>`` child is collected.

    Returns:
        A list of ``href`` strings. The list is empty when the index page
        cannot be fetched, so callers can always iterate the result. Items
        without an anchor or without an ``href`` are skipped instead of
        discarding the whole list.
    """
    try:
        index_soup = analytical_url(url)
    except requests.RequestException as error:
        # Report the failure but keep the return type iterable: returning the
        # exception object itself would make the caller's ``for`` loop fail.
        print("failed to fetch page list from {0}: {1}".format(url, error), file=sys.stderr)
        return []

    page_paths = []
    for item in index_soup.find_all('li', attrs={"class": "number"}):
        anchor = item.find('a')
        href = anchor.get("href") if anchor is not None else None
        if href:
            page_paths.append(href)
    return page_paths
def reptileDetailLabel():
    """Collect the detail-page interface path of every film on every list page.

    Each path returned by ``reptilePageLabel`` is fetched; inside every
    ``div`` carrying the classes ``el-col el-col-18 el-col-offset-3`` the
    anchors selected by the ``{"class": ""}`` filter are visited and their
    ``href`` values gathered in document order.

    Returns:
        A list with one entry per matched anchor (the ``href`` value, or
        ``None`` when the anchor has no ``href``).
    """
    detail_paths = []
    for page_path in reptilePageLabel():
        page_soup = analytical_url(url=url, port=page_path)
        containers = page_soup.find_all("div", attrs={"class": "el-col el-col-18 el-col-offset-3"})
        for container in containers:
            for anchor in container.find_all("a", attrs={"class": ""}):
                detail_paths.append(anchor.get("href"))
    return detail_paths
def _field(extract, default=''):
    """Return ``extract()``, or ``default`` when an expected tag is absent.

    A missing tag shows up as ``AttributeError`` (``find`` returned ``None``)
    or ``IndexError`` (``find_all`` returned too few items). Only those two
    are treated as "field not present"; anything else propagates.
    """
    try:
        return extract()
    except (AttributeError, IndexError):
        return default


def sqProcessDataAcquisition(limit=1):
    """Scrape film detail pages and return the extracted records.

    For every detail path from ``reptileDetailLabel`` the page is fetched and
    its fields are read one by one. Each field is extracted independently, so
    a missing element leaves only that field at its default (``''`` or an
    empty list) instead of blanking the rest. The film name and a separator
    line are printed for every film processed.

    Args:
        limit: Maximum number of films to process. The default of ``1`` keeps
            the historical behaviour of stopping after the first film that
            has a title; pass ``None`` to process every film.

    Returns:
        A list of dicts, one per processed film, with the keys ``name``,
        ``photo``, ``tags``, ``specification``, ``types``, ``region``,
        ``duration``, ``release_date``, ``detail``, ``rating``, ``directors``
        (dicts with ``name``/``photo``) and ``actors`` (dicts with
        ``photo``/``name``/``role``).
    """
    films = []
    for port in reptileDetailLabel():
        if limit is not None and len(films) >= limit:
            break
        responeSoups = analytical_url(url=url, port=port)

        # A page without a title is skipped entirely and does not count
        # towards ``limit``.
        title = responeSoups.find("h2", attrs={"class": "m-b-sm"})
        if title is None:
            continue
        film_name = title.text
        print(film_name)

        # Texts of the category buttons; the first is kept as the
        # "specification" and the remainder as the film types.
        film_tables = [
            tables.text
            for tables in responeSoups.find_all("button", attrs={"class": "el-button category el-button--primary el-button--mini"})
        ]
        # First info row: region in its first span, duration in its third;
        # second info row: release date in its first span.
        info_rows = responeSoups.find_all("div", attrs={"class": "m-v-sm info"})

        directors = []
        for director_message in responeSoups.find_all("div", attrs={"class": "director el-col el-col-4"}):
            directors.append({
                "name": _field(lambda: director_message.find("p").text),
                "photo": _field(lambda: director_message.find("img").get("src", '')),
            })

        actors = []
        for actor_message in responeSoups.find_all("div", attrs={"class": "actor el-col el-col-4"}):
            actor = actor_message.find("div", attrs={"class": "el-card__body"})
            actors.append({
                "photo": _field(lambda: actor.find("img").get("src", '')),
                # First paragraph is read as the actor name, last as the role.
                "name": _field(lambda: actor.find_all("p")[0].text),
                "role": _field(lambda: actor.find_all("p")[-1].text),
            })

        films.append({
            "name": film_name,
            "photo": _field(lambda: responeSoups.find("img", attrs={"class": "cover"}).get("src", '')),
            "tags": film_tables,
            "specification": film_tables[0] if film_tables else '',
            "types": film_tables[1:],
            "region": _field(lambda: info_rows[0].find("span").text),
            "duration": _field(lambda: info_rows[0].find_all("span")[2].text),
            "release_date": _field(lambda: info_rows[1].find("span").text),
            "detail": _field(lambda: responeSoups.find("div", attrs={"class": "drama"}).find("p").text.strip()),
            "rating": _field(lambda: responeSoups.find("p", attrs={"class": "score m-t-md m-b-n-sm"}).text.strip()),
            "directors": directors,
            "actors": actors,
        })
        print("-----------")
    return films



# Run the scraper only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    sqProcessDataAcquisition()