参考网站:
How to Install Selenium WebDriver on MacOS? - GeeksforGeeks
在代码里面需要模拟点击按键,在此过程中你需要知道按钮的位置
???????
在写代码过程中需要定位我们想要的元素
属性定位 我们需要的是a标签中属性href的值,所以我们在a标签后面使用/@href获取href的属性值
最后生成的数据集格式
?
我已经把代码传了上去
GitHub - Huhaobangbangbang/base_catree_Text_Categorization: the NLP projects in BISTU — https://github.com/Huhaobangbangbang/base_catree_Text_Categorization.git (see use_reptile_to_get_review.py)
"""
-*- coding: utf-8 -*-
author: Hao Hu
@date 2022/5/5 10:12 PM
"""
import os
import selenium.common.exceptions
from lxml import etree
import re
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from function import get_all_url,all_review_page,get_review_function
from function import get_new_link,save_data #这个函数是点击Next page得到下一个页面,save_data函数是将已有信息保存到json
from function import gethtml # 为了得到静态页面HTML,有对页面反应超时的情况做了些延时处理
from tqdm import tqdm
from time import sleep
# Static HTTP request headers used for every requests.get() call; mimics a
# desktop Chrome browser so Amazon serves the regular HTML page.
# NOTE(review): the Cookie value is a captured session token — it will expire
# and should be refreshed or removed rather than hard-coded.
hea = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'downlink': '8',
    'ect': '4g',
    'rtt': '250',
    'Cookie': "session-id=257-3500989-3695223; i18n-prefs=GBP; ubid-acbuk=257-5950834-2508848; x-wl-uid=1bEcLG2b03/1tAwPJNyfuRH+U7J9ZaPYejSBR4HXKuYQPJtLhQbDYyO/GOMypGKXqZrG7qBkS0ng=; session-token=x04EF8doE84tE+6CXYubsjmyob/3M6fdmsQuqzD0jwl/qGdO5aRc2eyhGiwoD0TFzK1rR/yziHsDS4v6cdqT2DySFXFZ9I5OHEtgufqBMEyrA0/Scr87KKA+GWOjfVmKRuPCqOGaixZQ6AIjU3e2iFOdM+3v90NeXFI3cazZcd6x9TYCy9b5u9V8zR7ePbdP; session-id-time=2082758401l; csm-hit=tb:MAA188S1G57TNTH6HQCZ+s-T9EGT4C8FC8J74X5T7CY|1594212767446&t:1594212767446&adb:adblk_no",
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
def change_address(postal):
    """Set Amazon's delivery address to *postal* via the location popover.

    Relies on the module-global ``driver`` (a running Chrome webdriver).
    ``postal`` may be a plain ZIP ('NNNNN') or ZIP+4 ('NNNNN-NNNN'); the
    hyphenated form is split across Amazon's two-part ZIP inputs when the
    single-field input is absent. Loops (refreshing the page) until the
    ZIP has been typed, then clicks the update button.

    Fixes vs. original: uses the Selenium 4 ``find_element(By.ID, ...)``
    API (``find_element_by_id`` was removed) and drops the misleading
    ``except Exception as NoSuchElementException`` alias, which bound ANY
    exception to the name of the Selenium exception class.
    """
    from selenium.webdriver.common.by import By
    while True:
        try:
            # Open the delivery-location popover in the nav bar.
            driver.find_element(By.ID, 'glow-ingress-line1').click()
            time.sleep(1)
        except Exception:
            # Page not ready / element missing — reload and retry.
            driver.refresh()
            time.sleep(3)
            continue
        try:
            # Switch the popover into "enter a ZIP code" mode
            # (the link is absent when it is already in that mode).
            driver.find_element(By.ID, "GLUXChangePostalCodeLink").click()
            time.sleep(2)
        except Exception:
            pass
        try:
            # Single-field ZIP input.
            driver.find_element(By.ID, 'GLUXZipUpdateInput').send_keys(postal)
            time.sleep(1)
            break
        except Exception:
            try:
                # Two-field ZIP+4 input: send each half of 'NNNNN-NNNN'.
                driver.find_element(By.ID, 'GLUXZipUpdateInput_0').send_keys(postal.split('-')[0])
                time.sleep(1)
                driver.find_element(By.ID, 'GLUXZipUpdateInput_1').send_keys(postal.split('-')[1])
                time.sleep(1)
                break
            except Exception:
                driver.refresh()
                time.sleep(3)
                continue
    print("重新选择地址")
    driver.find_element(By.ID, 'GLUXZipUpdate').click()
    time.sleep(1)
    driver.refresh()
def initializate_options():
    """Build the ChromeOptions shared by every headless browser session."""
    opts = webdriver.ChromeOptions()
    # Headless, sandbox-free Chrome suitable for scraping runs.
    for flag in ('--no-sandbox', '--headless', '--disable-gpu',
                 'disable-web-security', 'disable-infobars'):
        opts.add_argument(flag)
    # Suppress the "controlled by automated software" infobar.
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    return opts
# Module-level options object reused whenever a Chrome instance is launched.
options = initializate_options()
def get_price(html):
    """Return the product price text from a parsed product page.

    :param html: an lxml element (or any object) exposing ``.xpath``.
    :return: the first ``a-offscreen`` span's text, or ``'0'`` when the
        page carries no price element.
    """
    x_price = html.xpath('//span[@class="a-offscreen"]/text()')  # price candidates
    try:
        product_price = x_price[0]
    except IndexError:  # was a bare except: only an empty result is expected here
        product_price = '0'
    return product_price
def get_items(req):
    """Parse a product page with XPath and extract headline product info.

    :param req: either the raw HTML string or a requests.Response object.
    :return: (product_star, review_num, five_point_review) where
        product_star is the star-rating title text, review_num is the
        digits-only review count string, and five_point_review is the list
        of bullet-point descriptions.
    :raises IndexError: if the star/review-count elements are missing
        (callers wrap this call in a try block).
    """
    if isinstance(req, str):  # was type(req) == str — isinstance is the idiom
        html = etree.HTML(req)
    else:
        html = etree.HTML(req.text)
    # Overall product star rating.
    product_star = html.xpath('//div[@id="averageCustomerReviews_feature_div"]//span[@id="acrPopover"]/@title')[0]
    print(product_star)
    product_rate0 = html.xpath('//div[@id="averageCustomerReviews_feature_div"]//span[@id="acrCustomerReviewText"]/text()')[0]
    # Strip every non-digit to leave just the review count.
    review_num = re.sub(r"\D", "", product_rate0)
    print('参与打分的总人数: ', review_num)
    # The product's five bullet-point highlights.
    five_point_review = html.xpath('//div[@id="featurebullets_feature_div"]//ul//span[@class="a-list-item"]/text()')
    return product_star, review_num, five_point_review
def get_review(url_path, review_num):
    """Collect review samples for one product across paginated review pages.

    Starts from the product's all-reviews page and keeps following the
    "Next page" link until at least ``review_num`` reviews are gathered or
    Selenium can no longer find a next-page element.
    """
    product_review = []  # accumulated review dicts from every page
    tmp_link = all_review_page(url_path)
    target = int(review_num)
    while len(product_review) < target:
        try:
            product_review.extend(get_review_function(tmp_link))
            # Advance to the next review page.
            tmp_link = get_new_link(tmp_link)
        except selenium.common.exceptions.NoSuchElementException:
            # No "Next page" button left — we have everything available.
            print('采集数据完成', len(product_review))
            break
    return product_review
def get_already_coped(review_dir='/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/review_database'):
    """Return the ASINs whose reviews have already been saved.

    :param review_dir: directory of previously saved ``<ASIN>.json`` files
        (parameterized; the old hard-coded path remains the default).
    :return: list of filenames with the 5-char ``.json`` suffix stripped.
    """
    return [file[:-5] for file in os.listdir(review_dir)]
if __name__ == '__main__':
    # Launch Chrome per product URL, scrape metadata + reviews, save JSON.
    url_list = get_all_url()
    already_coped_list = get_already_coped()
    for url in tqdm(url_list):
        # Skip products whose ASIN (last 10 chars of the URL) is already saved.
        if url[-10:] in already_coped_list:
            continue
        driver = None
        try:
            # 'options=' replaces the 'chrome_options=' keyword removed in Selenium 4.
            driver = webdriver.Chrome(options=options)
            postal = "20237"  # Washington, D.C. ZIP (for change_address if needed)
            print("正在爬取初始页面", url)
            driver.get(url)
            req, error = gethtml(url, hea)  # static fetch with default headers
            product_star, review_num, five_point_review = get_items(req)
            product_review = get_review(url, review_num)
            save_data(product_star, review_num, five_point_review, product_review, url)
            sleep(20)  # throttle between products to avoid being blocked
        except Exception as exc:
            # Was a bare 'except: pass' — at least report which URL failed.
            print('failed:', url, exc)
        finally:
            # Quit even on failure; the original leaked a Chrome process per error.
            if driver is not None:
                driver.quit()
function.py
"""
-*- coding: utf-8 -*-
author: Hao Hu
@date 2022/5/5 10:13 PM
"""
from selenium import webdriver
from time import sleep
import time
from lxml import etree
from selenium.webdriver.common.by import By
import requests
import os, json
# Static HTTP request headers for requests.get(); impersonates desktop Chrome.
# NOTE(review): duplicated from use_reptile_to_get_review.py, and the Cookie
# is a captured session token that will expire — consider refreshing/removing.
hea = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'downlink': '8',
    'ect': '4g',
    'rtt': '250',
    'Cookie': "session-id=257-3500989-3695223; i18n-prefs=GBP; ubid-acbuk=257-5950834-2508848; x-wl-uid=1bEcLG2b03/1tAwPJNyfuRH+U7J9ZaPYejSBR4HXKuYQPJtLhQbDYyO/GOMypGKXqZrG7qBkS0ng=; session-token=x04EF8doE84tE+6CXYubsjmyob/3M6fdmsQuqzD0jwl/qGdO5aRc2eyhGiwoD0TFzK1rR/yziHsDS4v6cdqT2DySFXFZ9I5OHEtgufqBMEyrA0/Scr87KKA+GWOjfVmKRuPCqOGaixZQ6AIjU3e2iFOdM+3v90NeXFI3cazZcd6x9TYCy9b5u9V8zR7ePbdP; session-id-time=2082758401l; csm-hit=tb:MAA188S1G57TNTH6HQCZ+s-T9EGT4C8FC8J74X5T7CY|1594212767446&t:1594212767446&adb:adblk_no",
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
def initializate_options():
    """Build and return the ChromeOptions shared by all webdriver sessions."""
    # Launch and configure a headless, sandbox-free Chrome.
    options = webdriver.ChromeOptions()  # initialize Chrome options
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("disable-web-security")
    options.add_argument('disable-infobars')
    # Hide the "Chrome is being controlled by automated software" infobar.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    return options
# Module-level options instance reused by get_new_link().
options = initializate_options()
def gethtml(url0, head):
    """Fetch a static page's HTML with retries for timeouts and error codes.

    :param url0: URL to fetch.
    :param head: request headers dict.
    :return: (response, repeat) where repeat is the number of non-200 retries
        performed; a non-200 response may still be returned after 3 attempts.
    :raises Exception: after 5 request-level failures (timeouts etc.).
    """
    for attempt in range(5):  # request-level retries (timeouts, connection errors)
        try:
            html = requests.get(url=url0, headers=head, timeout=(10, 20))
            repeat = 0
            while html.status_code != 200:  # retry non-200 responses with backoff
                print('error: ', html.status_code)
                time.sleep(20 + repeat * 5)
                repeat += 1
                html = requests.get(url=url0, headers=head, timeout=(10, 20))
                if html.status_code != 200 and repeat == 2:
                    # Give up after 3 tries; caller gets the bad response + count.
                    return html, repeat
            return html, repeat
        except requests.exceptions.RequestException:
            print('超时重试次数: ', attempt + 1)
    # Was a bare 'raise Exception()' — say what actually went wrong.
    raise Exception('gethtml: failed to fetch %s after 5 attempts' % url0)
def get_all_url(database_dir='/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/database'):
    """Build the product-page URL for every file in the ASIN database dir.

    :param database_dir: directory whose filenames are ``<ASIN>.txt``
        (parameterized; the old hard-coded path remains the default).
    :return: list of 'https://www.amazon.com/dp/<ASIN>' URLs.

    URLs are built by plain concatenation — the original used os.path.join,
    which is for filesystem paths and is platform-dependent.
    """
    url_before = 'https://www.amazon.com/dp/'
    # file[:-4] strips the 4-char '.txt' suffix, leaving the ASIN.
    return [url_before + file[:-4] for file in os.listdir(database_dir)]
def all_review_page(url_path):
    """Return the URL of the product's "see all reviews" page.

    Fetches the product page and reads the see-all-reviews footer link.
    urljoin handles both absolute and root-relative hrefs correctly — the
    original string concatenation produced a double slash for hrefs that
    start with '/'.
    :raises IndexError: if the page has no see-all-reviews link.
    """
    from urllib.parse import urljoin
    html, repeat = gethtml(url_path, hea)
    tree = etree.HTML(html.text)
    hrefs = tree.xpath('//a[@data-hook="see-all-reviews-link-foot"]/@href')
    return urljoin('https://www.amazon.com/', hrefs[0])
def get_new_link(old_url):
    """Click "Next Page" on the current review page and return the new URL.

    :param old_url: URL of the current review page.
    :return: URL of the following review page.
    :raises selenium.common.exceptions.NoSuchElementException: when there is
        no next-page link (last page) — callers rely on this to stop paging.
    """
    # 'options=' replaces 'chrome_options=' (removed in Selenium 4).
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(old_url)
        browser.find_element(By.XPATH, '//li[@class="a-last"]/a').click()
        return browser.current_url
    finally:
        # The original never quit the browser, leaking one Chrome per page.
        browser.quit()
def save_data(product_star, review_num, five_point_review, product_review, url,
              out_dir='/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/review_database/'):
    """Persist one product's scraped data as ``<ASIN>.json`` under out_dir.

    :param product_star: overall star-rating text.
    :param review_num: digits-only review count string.
    :param five_point_review: list of bullet-point highlights.
    :param product_review: list of per-review sample dicts.
    :param url: product URL; its last 10 characters are the ASIN.
    :param out_dir: output directory (parameterized; old hard-coded default).
    """
    ids = url[-10:]  # ASIN = last 10 chars of the product URL
    sample_dict = {
        'asin': ids,
        'stars': product_star,
        'review_num': review_num,
        'highlights': five_point_review,
        'reviews': product_review,
    }
    json_path = os.path.join(out_dir, ids + '.json')
    # with-block guarantees the handle is closed (the original leaked it).
    with open(json_path, "w") as out_file:
        json.dump(sample_dict, out_file, indent=6)
def generate_sample(buyer_id, star_user, time_gived, size_product, colour_product, verified_information, review, review_title, people_found_useful_information):
    """Assemble one review's fields into a sample dict (one dict per review)."""
    return {
        'author': buyer_id,
        'stars': star_user,
        'date': time_gived,
        'is_verified_purchase': verified_information,
        'size_product': size_product,
        'colour_product': colour_product,
        'people_found_useful_num': people_found_useful_information,
        'review_title': review_title,
        'review': review,
    }
def get_review_function(url_link):
    """Scrape all reviews on one review page into a list of sample dicts.

    Each review element is serialized back to an HTML string and mined with
    plain string splitting (see cope_string). Reviews whose markup doesn't
    match the expected markers are skipped rather than aborting the page.
    """
    html, _ = gethtml(url_link, hea)  # default header
    tree = etree.HTML(html.text)
    review_element = tree.xpath('//div[@data-hook="review"]')
    review_sample_list = []
    # Size/colour strip texts: either 2 entries per review (size, colour)
    # or 1 (colour only) — disambiguated by the length ratio below.
    products_information_list = tree.xpath('//div[@class="a-section review aok-relative"]//a[@data-hook="format-strip"]/text()')
    for index in range(len(review_element)):
        sample = review_element[index]
        html_str = str(etree.tostring(sample))
        colour_product = ''
        size_product = ''
        try:
            buyer_id, star_user, time_gived, review_content, review_title = cope_string(html_str)
            # Text before the review body says whether this is a Vine
            # (free-product) review or an ordinary verified purchase.
            current_str = html_str.split('<span data-hook="review-body"')[0]
            if 'Vine' in current_str:
                verified_information = 'Vine Customer Review of Free Product'
            else:
                verified_information = 'verified Purchase'
            try:
                if len(products_information_list) > len(review_element):
                    size_product = products_information_list[index * 2]
                    colour_product = products_information_list[index * 2 + 1]
                else:
                    colour_product = products_information_list[index]
            except IndexError:  # was bare except: review without a format strip
                pass
            people_found_useful_information = get_found_useful_information_num(html_str)
            review_sample = generate_sample(buyer_id, star_user, time_gived, size_product, colour_product, verified_information, review_content, review_title, people_found_useful_information)
            review_sample_list.append(review_sample)
        except Exception:
            # Best-effort: skip unparsable reviews instead of failing the page
            # (was a bare except — at least don't swallow KeyboardInterrupt).
            pass
    return review_sample_list
def get_found_useful_information_num(html_str):
    """Extract the "N people found this helpful" count from review HTML.

    :param html_str: serialized HTML of one review element.
    :return: '<N> people found this helpful', or the zero form when the
        vote marker is absent.
    """
    try:
        # Text between the 'cr-vote-text">' marker and the helpful-button div
        # holds the count; ' &#' starts the trailing HTML entity.
        current_str = html_str.split('<div class="cr-helpful-button aok-float-left">')[0].split('cr-vote-text">')[1]
        people_found_useful_information = current_str.split(' &#')[0] + ' people found this helpful'
    except IndexError:  # was bare except: marker missing means zero votes shown
        people_found_useful_information = '0 people found this helpful'
    return people_found_useful_information
def cope_string(html_str):
    """Mine one review's fields out of its serialized HTML string.

    Every field is recovered by splitting on Amazon's markup markers.
    Returns (account, stars, date_text, body, title); an IndexError
    propagates to the caller when a marker is missing.
    """
    # Reviewer id sits inside the /gp/profile/<id>/ link.
    account = html_str.split('/gp/profile/')[1].split('/')[0]
    # Star rating, e.g. "5.0 out of 5 stars" (trailing HTML entity stripped).
    stars = html_str.split('<span class="a-icon-alt">')[1]
    stars = stars.split('</span>')[0].split('&#')[0]
    # Review date text between the date marker and the following div.
    date_text = html_str.split('review-date">')[1].split('</span><div class=')[0]
    # Review body: inner <span> of the review-body element.
    body = html_str.split('data-hook="review-body"')[1]
    body = body.split('<span>')[1].split('</span>')[0]
    # Review title: between the title marker and the date marker.
    title = html_str.split('data-hook="review-title"')[1]
    title = title.split('<span data-hook="review-date"')[0]
    title = title.split('<span>')[1].split('</span>')[0]
    return account, stars, date_text, body, title
if __name__ == '__main__':
    # Ad-hoc manual test: scrape a single review page.
    options = initializate_options()
    # Earlier test targets kept for reference (the last assignment wins —
    # the original reassigned url_path immediately, leaving dead code):
    # url_path = 'https://www.amazon.com/dp/B0921T6QFC'
    # url_path = 'https://www.amazon.com/Pawstory-Scratching-Multi-Level-Hammock-Furniture/product-reviews/B09FZ9ZV55/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3'
    url_path = 'https://www.amazon.com/Made4Pets-Featuring-Sisal-Covered-Scratching-Spacious/product-reviews/B081SH7JVJ/ref=cm_cr_arp_d_paging_btm_next_8?ie=UTF8&reviewerType=all_reviews&pageNumber=8'
    get_review_function(url_path)
|