
[Development & Testing] Scraping Amazon product reviews with Selenium

Reference:

How to Install Selenium WebDriver on MacOS? - GeeksforGeeks
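Assuming Selenium has been installed with pip and a chromedriver matching the local Chrome version is on the PATH (roughly what that guide walks through), a quick sanity check of the setup might look like this sketch:

# Minimal sketch: verify that Selenium can drive a headless Chrome.
# Assumes `pip install selenium` has been run and chromedriver is on PATH.
from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')

driver = webdriver.Chrome(options=options)
driver.get('https://www.amazon.com')
print(driver.title)  # should print the Amazon home page title if everything is wired up
driver.quit()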

In the code we need to simulate button clicks, and to do that we first need to know where the button element sits on the page, as sketched below.

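A minimal sketch of that idea: open the page, find the button element by its id, and click it. The id used here, glow-ingress-line1, is Amazon's delivery-location widget, the same element change_address() clicks further down; any other button needs its own id, found via the browser's developer tools.

# Minimal sketch: locate a button by id and simulate a click with Selenium.
from selenium import webdriver
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome()  # assumes chromedriver is on PATH
driver.get('https://www.amazon.com')
# 'glow-ingress-line1' is the id of Amazon's delivery-location widget,
# the same element that change_address() below clicks on.
driver.find_element(By.ID, 'glow-ingress-line1').click()
time.sleep(1)  # give the location pop-up a moment to open
driver.quit()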

While writing the code we also need to locate the elements we want to extract.

Locating by attribute
What we want is the value of the href attribute on an a tag, so we append /@href to the a step of the XPath expression to pull out the href value.
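For instance, with lxml the pattern looks like the sketch below; the HTML string is a simplified stand-in for the real page, and the data-hook value is the one all_review_page() in function.py queries.

# Small sketch of extracting an href value with lxml's XPath.
from lxml import etree

# Simplified stand-in for a fragment of an Amazon product page.
html_text = '<div><a data-hook="see-all-reviews-link-foot" href="/product-reviews/B081SH7JVJ">See all reviews</a></div>'
html = etree.HTML(html_text)

# //a[...] selects the <a> element; the trailing /@href returns its href attribute value.
hrefs = html.xpath('//a[@data-hook="see-all-reviews-link-foot"]/@href')
print(hrefs[0])  # -> /product-reviews/B081SH7JVJ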

The format of the final generated dataset:

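Based on save_data() and generate_sample() in function.py, each product ends up in an <ASIN>.json file shaped roughly like this (all values here are illustrative):

{
    "asin": "B081SH7JVJ",
    "stars": "4.5 out of 5 stars",
    "review_num": "1234",
    "highlights": ["...the product's five feature bullets..."],
    "reviews": [
        {
            "author": "AEXAMPLEPROFILEID",
            "stars": "5.0 out of 5 stars",
            "date": "Reviewed in the United States on May 5, 2022",
            "is_verified_purchase": "verified Purchase",
            "size_product": "Large",
            "colour_product": "Grey",
            "people_found_useful_num": "3 people found this helpful",
            "review_title": "Great product",
            "review": "..."
        }
    ]
}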

I have uploaded the code to GitHub:

GitHub - Huhaobangbangbang/base_catree_Text_Categorization: the NLP_projects in Bistu
https://github.com/Huhaobangbangbang/base_catree_Text_Categorization.git

use_reptile_to_get_review.py

"""
 -*- coding: utf-8 -*-
 author: Hao Hu
 @date   2022/5/5 10:12 PM
"""
import os

import selenium.common.exceptions
from lxml import etree
import re
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from function import get_all_url,all_review_page,get_review_function
from function import get_new_link,save_data  # get_new_link clicks "Next page" to reach the next page; save_data writes the collected info to JSON
from function import gethtml  # fetches the static page HTML, with some delay/retry handling for slow responses
from tqdm import tqdm
from time import sleep
hea = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'downlink': '8',
    'ect': '4g',
    'rtt': '250',
    'Cookie': "session-id=257-3500989-3695223; i18n-prefs=GBP; ubid-acbuk=257-5950834-2508848; x-wl-uid=1bEcLG2b03/1tAwPJNyfuRH+U7J9ZaPYejSBR4HXKuYQPJtLhQbDYyO/GOMypGKXqZrG7qBkS0ng=; session-token=x04EF8doE84tE+6CXYubsjmyob/3M6fdmsQuqzD0jwl/qGdO5aRc2eyhGiwoD0TFzK1rR/yziHsDS4v6cdqT2DySFXFZ9I5OHEtgufqBMEyrA0/Scr87KKA+GWOjfVmKRuPCqOGaixZQ6AIjU3e2iFOdM+3v90NeXFI3cazZcd6x9TYCy9b5u9V8zR7ePbdP; session-id-time=2082758401l; csm-hit=tb:MAA188S1G57TNTH6HQCZ+s-T9EGT4C8FC8J74X5T7CY|1594212767446&t:1594212767446&adb:adblk_no",
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
def change_address(postal):
    while True:
        try:
            driver.find_element_by_id('glow-ingress-line1').click()
            # driver.find_element_by_id('nav-global-location-slot').click()
            time.sleep(1)
        except Exception as e:
            driver.refresh()
            time.sleep(3)
            continue
        try:
            driver.find_element_by_id("GLUXChangePostalCodeLink").click()
            time.sleep(2)
        except:
            pass
        try:
            driver.find_element_by_id('GLUXZipUpdateInput').send_keys(postal)
            time.sleep(1)
            break
        except Exception:  # typically NoSuchElementException when the single-field zip input is absent
            try:
                driver.find_element_by_id('GLUXZipUpdateInput_0').send_keys(postal.split('-')[0])
                time.sleep(1)
                driver.find_element_by_id('GLUXZipUpdateInput_1').send_keys(postal.split('-')[1])
                time.sleep(1)
                break
            except Exception:  # typically NoSuchElementException; reload the page and retry
                driver.refresh()
                time.sleep(3)
                continue
        print("重新选择地址")
    driver.find_element_by_id('GLUXZipUpdate').click()
    time.sleep(1)
    driver.refresh()


def initializate_options():
    """初始化"""
    # 启动并初始化Chrome
    options = webdriver.ChromeOptions()  # 初始化Chrome
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("disable-web-security")
    options.add_argument('disable-infobars')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    return options
options = initializate_options()


def get_price(html):
    """获得商品价格"""
    x_price = html.xpath('//span[@class="a-offscreen"]/text()')  # 价格
    try:
        product_price = x_price[0]
    except:
        product_price = '0'

    return product_price


def get_items(req):
    """使用Xpath解析页面,提取商品信息"""
    if (type(req) == str):
        html = etree.HTML(req)
    else:
        html = etree.HTML(req.text)
    #商品总体评分
    product_star = html.xpath('//div[@id="averageCustomerReviews_feature_div"]//span[@id="acrPopover"]/@title')[0]  # 星级
    print(product_star)
    product_rate0 = html.xpath('//div[@id="averageCustomerReviews_feature_div"]//span[@id="acrCustomerReviewText"]/text()')[0]  # 评论总数
    review_num = re.sub("\D", "", product_rate0)
    print('参与打分的总人数: ', review_num)
    # 商品的5点
    five_point_review = html.xpath('//div[@id="featurebullets_feature_div"]//ul//span[@class="a-list-item"]/text()')  # 五点描述

    return product_star,review_num,five_point_review


def get_review(url_path,review_num):
    """得到商品评价"""
    product_review = [] #所有页面的评论信息
    tmp_link = all_review_page(url_path)
    while(len(product_review)<int(review_num)):
        try:
            review_tmp = get_review_function(tmp_link)
            product_review += review_tmp
            tmp_link = get_new_link(tmp_link) # 翻到下一页
        except selenium.common.exceptions.NoSuchElementException:
            print('采集数据完成',len(product_review))
            break
    return product_review


def get_already_coped():
    files = os.listdir('/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/review_database')
    already_coped = []
    for file in files:
        already_coped.append(file[:-5])
    return already_coped

if __name__ == '__main__':
    # start and initialise Chrome
    url_list = get_all_url()

    already_coped_list = get_already_coped()
    for url in tqdm(url_list):
        try:
            if url[-10:] in already_coped_list:
                pass
            else:
                driver = webdriver.Chrome(chrome_options=options)
                wait = WebDriverWait(driver, 20)
                postal = "20237"  # 华盛顿
                print("正在爬取初始页面", url)
                driver.get(url)
                req, error = gethtml(url, hea)  # 默认header
                product_star,review_num,five_point_review = get_items(req)
                product_review = get_review(url,review_num)
                save_data(product_star, review_num, five_point_review, product_review, url)
                sleep(20)
                driver.quit()  # close the browser
        except:
            pass  # skip any product page that fails to load or parse


function.py

"""
 -*- coding: utf-8 -*-
 author: Hao Hu
 @date   2022/5/5 10:13 PM
"""
from selenium import webdriver
from time import sleep
import time
from lxml import etree
from selenium.webdriver.common.by import By
import requests
import os, json
hea = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'downlink': '8',
    'ect': '4g',
    'rtt': '250',
    'Cookie': "session-id=257-3500989-3695223; i18n-prefs=GBP; ubid-acbuk=257-5950834-2508848; x-wl-uid=1bEcLG2b03/1tAwPJNyfuRH+U7J9ZaPYejSBR4HXKuYQPJtLhQbDYyO/GOMypGKXqZrG7qBkS0ng=; session-token=x04EF8doE84tE+6CXYubsjmyob/3M6fdmsQuqzD0jwl/qGdO5aRc2eyhGiwoD0TFzK1rR/yziHsDS4v6cdqT2DySFXFZ9I5OHEtgufqBMEyrA0/Scr87KKA+GWOjfVmKRuPCqOGaixZQ6AIjU3e2iFOdM+3v90NeXFI3cazZcd6x9TYCy9b5u9V8zR7ePbdP; session-id-time=2082758401l; csm-hit=tb:MAA188S1G57TNTH6HQCZ+s-T9EGT4C8FC8J74X5T7CY|1594212767446&t:1594212767446&adb:adblk_no",
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}


def initializate_options():
    """初始化"""
    # 启动并初始化Chrome
    options = webdriver.ChromeOptions()  # 初始化Chrome
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("disable-web-security")
    options.add_argument('disable-infobars')
    options.add_experimental_option('excludeSwitches', ['enable-automation'])

    return options

options = initializate_options()

def gethtml(url0, head):
    """为了得到静态页面HTML,有对页面反应超时的情况做了些延时处理"""
    i = 0
    while i < 5:
        try:
            html = requests.get(url=url0, headers=head, timeout=(10, 20))
            repeat = 0
            while (html.status_code != 200):  # retry on non-200 status codes
                print('error: ', html.status_code)
                time.sleep(20 + repeat * 5)
                repeat += 1
                html = requests.get(url=url0, headers=head, timeout=(10, 20))
                if (html.status_code != 200 and repeat == 2):
                    return html, repeat
            return html, repeat
        except requests.exceptions.RequestException:
            print('Timeout, retry attempt:', i + 1)
            i += 1
    raise Exception('gethtml: giving up on ' + url0 + ' after repeated timeouts')

def get_all_url():
    """得到所有商品页面链接"""
    url_before = 'https://www.amazon.com/dp/'
    files = os.listdir('/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/database')
    url_list = []
    for file in files:
        url_after = file[:-4]
        url_path = os.path.join(url_before,url_after)
        url_list.append(url_path)
    return url_list

def all_review_page(url_path):
    """翻到下一页"""
    """得到评论网页的链接"""

    html, repeat = gethtml(url_path, hea)
    html = etree.HTML(html.text)
    new_url = html.xpath('//a[@data-hook="see-all-reviews-link-foot"]/@href')
    new_url = 'https://www.amazon.com/' + new_url[0]
    return new_url

def get_new_link(old_url):
    """通过当前页面,点击Next Page得到下一个页面的新链接"""
    browser = webdriver.Chrome(chrome_options=options)
    browser.get(old_url)
    next_button = browser.find_element(By.XPATH, '//li[@class="a-last"]/a')
    next_button.click()
    new_url = browser.current_url
    return new_url

def save_data(product_star,review_num,five_point_review,product_review,url):
    """通过传过来的数据保存到json文件下"""
    ids = url[-10:]
    sample_dict = {}
    sample_dict['asin'] = ids
    sample_dict['stars'] = product_star
    sample_dict['review_num'] = review_num
    sample_dict['highlights'] = five_point_review
    sample_dict['reviews'] = product_review
    json_path = os.path.join('/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/review_database/',ids+'.json')
    with open(json_path, "w") as out_file:
        json.dump(sample_dict, out_file, indent=6)

def generate_sample(buyer_id,star_user,time_gived,size_product,colour_product,verified_information,review,review_title,people_found_useful_information):
    """获得数据,输出字典形式的sample"""
    review_sample = {}#一个评价一个sample
    review_sample['author'] = buyer_id
    review_sample['stars'] = star_user
    review_sample['date'] = time_gived
    review_sample['is_verified_purchase'] = verified_information
    review_sample['size_product'] = size_product
    review_sample['colour_product'] = colour_product
    review_sample['people_found_useful_num'] = people_found_useful_information
    review_sample['review_title'] = review_title
    review_sample['review'] = review

    return review_sample

def get_review_function(url_link):
    """得到当前页面的评论"""
    html, _ = gethtml(url_link, hea)  # 默认header
    # ISO-8859-1
    html = etree.HTML(html.text)
    review_element = html.xpath('//div[@data-hook="review"]')
    # 商品购买信息
    review_sample_list = []
    # 商品购买信息
    products_information_list = html.xpath('//div[@class="a-section review aok-relative"]//a[@data-hook="format-strip"]/text()')

    for index in range(len(review_element)):
        sample = review_element[index]
        html_str = str(etree.tostring(sample))
        colour_product = ''
        size_product = ''
        try:
            buyer_id, star_user, time_gived, review_content, review_title = cope_string(html_str)
            current_str = html_str.split('<span data-hook="review-body"')[0]
            if 'Vine' in current_str:
                verified_information = 'Vine Customer Review of Free Product'
            else:
                verified_information = 'verified Purchase'
            try:
                if len(products_information_list) > len(review_element):
                    size_product = products_information_list[index*2]
                    colour_product = products_information_list[index*2+1]
                else:
                    colour_product = products_information_list[index]
            except IndexError:
                pass
            people_found_useful_information = get_found_useful_information_num(html_str)
            review_sample = generate_sample(buyer_id, star_user, time_gived, size_product, colour_product, verified_information, review_content, review_title, people_found_useful_information)
            review_sample_list.append(review_sample)
        except:
            pass  # skip reviews whose HTML does not match the expected structure
    return review_sample_list

def get_found_useful_information_num(html_str):
    """得到大家认为其有用信息的个数"""
    try:
        current_str = html_str.split('<div class="cr-helpful-button aok-float-left">')[0].split('cr-vote-text">')[1]
        people_found_useful_information =current_str.split(' &#')[0] + ' people found this helpful'
    except:
        people_found_useful_information = '0 people found this helpful'

    return people_found_useful_information


def cope_string(html_str):
    """处理网页中的字符串,过滤得到我们想要的东西"""
    # 用户的id
    account_str = html_str.split('/gp/profile/')[1]
    account = account_str.split('/')[0]
    # 用户给的🌟
    stars_buyer_gived_str = html_str.split('<span class="a-icon-alt">')[1]
    stars_buyer_gived = stars_buyer_gived_str.split('</span>')[0].split('&#')[0]
    # 接下来是用户评价的时间
    time_buyer_gived_str = html_str.split('review-date">')[1].split('</span><div class=')[0]
    review_content_str = html_str.split('data-hook="review-body"')[1]
    review_content = review_content_str.split('<span>')[1].split('</span>')[0]
    review_title_str = html_str.split('data-hook="review-title"')[1].split('<span data-hook="review-date"')[0]

    review_title = review_title_str.split('<span>')[1].split('</span>')[0]


    return account,stars_buyer_gived,time_buyer_gived_str,review_content,review_title



if __name__ == '__main__':
    options = initializate_options()
    # url_path = 'https://www.amazon.com/dp/B0921T6QFC'
    url_path = 'https://www.amazon.com/Made4Pets-Featuring-Sisal-Covered-Scratching-Spacious/product-reviews/B081SH7JVJ/ref=cm_cr_arp_d_paging_btm_next_8?ie=UTF8&reviewerType=all_reviews&pageNumber=8'
    #url_path = 'https://www.amazon.com/Pawstory-Scratching-Multi-Level-Hammock-Furniture/product-reviews/B09FZ9ZV55/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3'
    get_review_function(url_path)
