IT数码 购物 网址 头条 软件 日历 阅读 图书馆
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
图片批量下载器
↓批量下载图片,美女图库↓
图片自动播放器
↓图片自动播放器↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁
 
   -> Python知识库 -> Python 豆瓣电影 Top 250 xpath,beautiful soup,pyquery -> 正文阅读

[Python知识库]Python 豆瓣电影 Top 250 xpath,beautiful soup,pyquery

作者:recommend-item-box type_blog clearfix

xpath:


import requests
import time
import csv
from requests import RequestException
from lxml import etree


def get_one_page(url):
    """Fetch one Douban Top-250 listing page and return its HTML.

    Parameters
    ----------
    url : str
        Full page URL, e.g. 'https://movie.douban.com/top250?start=0&filter='.

    Returns
    -------
    str
        The response body on HTTP 200; the sentinel string '请求失败' on any
        other status code, or '爬取错误' when the request raises.  The
        sentinels are kept as-is for backward compatibility with parse().
    """
    try:
        # Browser-like headers: Douban rejects requests without a real
        # User-Agent, and the Cookie carries a logged-in session.
        # NOTE(review/security): a live session cookie is hard-coded here —
        # it should be moved to config/env rather than committed.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        # timeout keeps the crawler from hanging forever on a stalled
        # connection; a timeout raises and falls into the except below.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return '请求失败'
    except RequestException:
        return '爬取错误'


def parse():
    """Scrape all ten pages of the Douban Top 250 with XPath.

    Returns
    -------
    list[list[str]]
        One row per movie:
        [rank, title, director, actors, release year, score, vote count,
        one-line quote].  The quote is '' for movies that have none.
    """
    moviesList = []
    for page in range(10):
        # 25 movies per page: start = 0, 25, ..., 225.
        url = 'https://movie.douban.com/top250?start=' + str(25 * page) + '&filter='
        response = get_one_page(url)

        html = etree.HTML(response)
        for item in html.xpath('//ol[@class="grid_view"]/li'):
            index = item.xpath('./div/div/em/text()')[0]  # rank number
            name = item.xpath('./div/div[2]/div[1]/a[1]/span[1]/text()')[0]  # title

            director_actor = item.xpath('./div/div[2]/div[2]/p[1]/text()')[0].strip('\n').strip('\xa0')
            # The line reads "导演: X ... 主演: Y ...".  Some entries have no
            # "主演" part; find() would return -1 and silently mis-slice
            # (dropping the director's last character), so guard explicitly.
            pos = director_actor.find('主演')
            if pos == -1:
                director = director_actor.strip()
                actor = ''
            else:
                director = director_actor[:pos].strip()  # director
                actor = director_actor[pos:].strip()     # leading actors

            movies_release_date = item.xpath('./div/div[2]/div[2]/p/text()')[1].strip()[0:4]  # year only
            movies_score = item.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0]       # rating
            movies_score_num = item.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0]   # vote count
            # A few movies carry no one-line quote; indexing [0] blindly
            # would raise IndexError (the bs4 variant already guards this).
            intro_nodes = item.xpath('./div/div[2]/div[2]/p[2]/span/text()')
            movies_introduce = intro_nodes[0] if intro_nodes else ''

            moviesList.append([index, name, director, actor, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        # Be polite: pause between pages to avoid being rate-limited.
        time.sleep(2)

    return moviesList

def write_to_file(moviesList):
    """Persist the scraped movie rows to 'xpath_result.csv' (UTF-8).

    Parameters
    ----------
    moviesList : list[list]
        Rows as produced by parse(); written verbatim after a header row.
    """
    header = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
    # newline='' is required so the csv module controls line endings itself.
    with open('xpath_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(moviesList)


if __name__ == '__main__':
    # Entry point: crawl all 10 pages, then write the rows to xpath_result.csv.
    moviesList = parse()
    write_to_file(moviesList)

beautiful soup:


import requests
import time
import csv
from requests.exceptions import RequestException
from bs4 import BeautifulSoup


def get_one_page(url):
    """Fetch one Douban Top-250 listing page and return its HTML.

    Parameters
    ----------
    url : str
        Full page URL, e.g. 'https://movie.douban.com/top250?start=0&filter='.

    Returns
    -------
    str
        The response body on HTTP 200; the sentinel string '请求失败' on any
        other status code, or '爬取错误' when the request raises.  The
        sentinels are kept as-is for backward compatibility with parse().
    """
    try:
        # Browser-like headers: Douban rejects requests without a real
        # User-Agent, and the Cookie carries a logged-in session.
        # NOTE(review/security): a live session cookie is hard-coded here —
        # it should be moved to config/env rather than committed.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        # timeout keeps the crawler from hanging forever on a stalled
        # connection; a timeout raises and falls into the except below.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return '请求失败'
    except RequestException:
        return '爬取错误'

def parse():
    """Scrape all ten pages of the Douban Top 250 with BeautifulSoup.

    Returns a list of rows:
    [rank, title, director, actors, release year, score, vote count,
    one-line quote] — the quote is '' for movies that have none.
    """
    moviesList = []
    for page in range(10):
        # Each page lists 25 movies; the offset rides in the "start" param.
        page_url = 'https://movie.douban.com/top250?start=' + str(25 * page) + '&filter='
        markup = get_one_page(page_url)

        soup = BeautifulSoup(markup, 'lxml')
        for item in soup.select('ol li'):
            rank = item.find('div', class_='pic').find('em').get_text()
            title = item.find('span', class_='title').get_text()

            # First line: "导演: ... 主演: ..."; second line: year / country / genre.
            info_lines = item.find(class_='bd').p.get_text().strip().split('\n')
            credits = info_lines[0].split('\xa0\xa0\xa0')
            director = credits[0]
            # Some entries (e.g. documentaries) carry no actor segment.
            starring = credits[1] if len(credits) > 1 else ""

            year = info_lines[1].strip().split('\xa0/\xa0')[0]
            score = item.find(class_='rating_num').get_text()
            votes = item.find(class_='star').get_text().strip().split('\n')[2]
            quote_tag = item.find(class_='inq')  # absent for a few movies
            quote = quote_tag.get_text() if quote_tag else ''

            moviesList.append([rank, title, director, starring, year,
                               score, votes, quote])
        # Pause between pages so the crawler is not rate-limited.
        time.sleep(2)

    return moviesList

def write_to_file(moviesList):
    """Persist the scraped movie rows to 'bs4_result.csv' (UTF-8).

    Parameters
    ----------
    moviesList : list[list]
        Rows as produced by parse(); written verbatim after a header row.
    """
    header = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
    # newline='' is required so the csv module controls line endings itself.
    with open('bs4_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(moviesList)


if __name__ == '__main__':
    # Entry point: crawl all 10 pages, then write the rows to bs4_result.csv.
    moviesList = parse()
    write_to_file(moviesList)

pyquery:


import requests
import time
import csv
from requests import RequestException
from pyquery import PyQuery as pq


def get_one_page(url):
    """Fetch one Douban Top-250 listing page and return its HTML.

    Parameters
    ----------
    url : str
        Full page URL, e.g. 'https://movie.douban.com/top250?start=0&filter='.

    Returns
    -------
    str
        The response body on HTTP 200; the sentinel string '请求失败' on any
        other status code, or '爬取错误' when the request raises.  The
        sentinels are kept as-is for backward compatibility with parse().
    """
    try:
        # Browser-like headers: Douban rejects requests without a real
        # User-Agent, and the Cookie carries a logged-in session.
        # NOTE(review/security): a live session cookie is hard-coded here —
        # it should be moved to config/env rather than committed.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
            'Cookie': 'bid=bq6bl4kmTuw; douban-fav-remind=1; __gads=ID=591697b17f1fe23a-220f8b8865cf0092:T=1639058234:RT=1639058234:S=ALNI_MZO_48BdnjRSkE5opDvE-zAaE9tAA; ll="108288"; __utmz=223695111.1639215564.2.2.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; _vwo_uuid_v2=D23ED6BBB550E9DB6F96FC7952ABEC279|a138b60c8a7b45c62db689ccc572217f; push_noty_num=0; push_doumail_num=0; ap_v=0,6.0; __utmz=30149280.1639231925.6.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmv=30149280.25152; dbcl2="251525456:JrZGQt48U80"; ck=heCZ; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1639235024%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utmb=30149280.0.10.1639235088; __utmc=30149280; __utma=30149280.635194538.1639058235.1639231925.1639235088.7; __utma=223695111.1210278848.1639058235.1639226759.1639235088.6; __utmb=223695111.0.10.1639235088; __utmc=223695111; _pk_id.100001.4cf6=79142103d75fe9fc.1639058235.6.1639235410.1639226759.'
        }
        # timeout keeps the crawler from hanging forever on a stalled
        # connection; a timeout raises and falls into the except below.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        return '请求失败'
    except RequestException:
        return '爬取错误'


def parse():
    """Scrape all ten pages of the Douban Top 250 with PyQuery.

    Returns
    -------
    list[list[str]]
        One row per movie:
        [rank, title, director, actors, release year, score, vote count,
        one-line quote].  PyQuery's .text() yields '' for a missing quote.
    """
    moviesList = []
    for page in range(10):
        # 25 movies per page: start = 0, 25, ..., 225.
        url = 'https://movie.douban.com/top250?start=' + str(25 * page) + '&filter='
        response = get_one_page(url)

        doc = pq(response)
        for item in doc('div.item').items():
            index = item.find('.pic em').text()  # rank number
            # The title cell may list aliases separated by '/': keep the first.
            name = item.find('.hd span.title').text().split('/')[0]

            credits_line = item.find('div.bd p:eq(0)').text().split('\n')[0]
            # The line reads "导演: X ... 主演: Y ...".  Some entries have no
            # "主演" part; find() would return -1 and silently mis-slice
            # (dropping the director's last character), so guard explicitly.
            pos = credits_line.find('主演')
            if pos == -1:
                director = credits_line.strip()
                actor = ''
            else:
                director = credits_line[:pos].strip()  # director
                actor = credits_line[pos:].strip()     # leading actors

            movies_release_date = item.find('div.bd p:eq(0)').text().split('\n')[1].strip()[0:4]  # year only
            movies_score = item.find('span.rating_num').text()                 # rating
            movies_score_num = item.find('.star span').text().strip().split()[1]  # vote count
            movies_introduce = item.find('span.inq').text()                    # '' when absent

            moviesList.append([index, name, director, actor, movies_release_date,
                               movies_score, movies_score_num, movies_introduce])
        # Be polite: pause between pages to avoid being rate-limited.
        time.sleep(2)

    return moviesList

def write_to_file(moviesList):
    """Persist the scraped movie rows to 'pyquery_result.csv' (UTF-8).

    Parameters
    ----------
    moviesList : list[list]
        Rows as produced by parse(); written verbatim after a header row.
    """
    header = ['电影排名', '电影名称', '导演', '主演', '发布日期', '评分', '评分人数', '简介']
    # newline='' is required so the csv module controls line endings itself.
    with open('pyquery_result.csv', 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        writer.writerows(moviesList)


if __name__ == '__main__':
    # Entry point: crawl all 10 pages, then write the rows to pyquery_result.csv.
    moviesList = parse()
    write_to_file(moviesList)

部分结果截图

全部结果CSV在 我的CSDN的资源中下载

  Python知识库 最新文章
Python中String模块
【Python】 14-CSV文件操作
python的panda库读写文件
使用Nordic的nrf52840实现蓝牙DFU过程
【Python学习记录】numpy数组用法整理
Python学习笔记
python字符串和列表
python如何从txt文件中解析出有效的数据
Python编程从入门到实践自学/3.1-3.2
python变量
上一篇文章      下一篇文章      查看所有文章
加:2021-12-13 12:46:15  更:2021-12-13 12:47:42 
 
开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

360图书馆 购物 三丰科技 阅读网 日历 万年历 2024年11日历 -2024/11/16 4:39:16-

图片自动播放器
↓图片自动播放器↓
TxT小说阅读器
↓语音阅读,小说下载,古典文学↓
一键清除垃圾
↓轻轻一点,清除系统垃圾↓
图片批量下载器
↓批量下载图片,美女图库↓
  网站联系: qq:121756557 email:121756557@qq.com  IT数码