参考网站:
How to Install Selenium WebDriver on MacOS? - GeeksforGeeks
在代码里面需要模拟点击按键,在此过程中你需要知道按钮的位置
???????
在写代码过程中需要定位我们想要的元素
属性定位 我们需要的是a标签中属性href的值,所以我们在a标签后面使用/@href获取href的属性值
最后生成的数据集格式
?
我已经把代码传了上去
GitHub - Huhaobangbangbang/base_catree_Text_Categorization: the NLP projects in BISTU — https://github.com/Huhaobangbangbang/base_catree_Text_Categorization.git (see use_reptile_to_get_review.py)
"""
-*- coding: utf-8 -*-
author: Hao Hu
@date 2022/5/5 10:12 PM
"""
import os
import selenium.common.exceptions
from lxml import etree
import re
import time
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from function import get_all_url,all_review_page,get_review_function
from function import get_new_link,save_data #这个函数是点击Next page得到下一个页面,save_data函数是将已有信息保存到json
from function import gethtml # 为了得到静态页面HTML,有对页面反应超时的情况做了些延时处理
from tqdm import tqdm
from time import sleep
# Static HTTP request headers used for every requests.get() call; mimics a
# desktop Chrome browser so Amazon serves the regular HTML page.
# NOTE(review): the Cookie value is a captured session token — it will expire
# and should be refreshed or removed rather than hard-coded.
hea = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'downlink': '8',
    'ect': '4g',
    'rtt': '250',
    'Cookie': "session-id=257-3500989-3695223; i18n-prefs=GBP; ubid-acbuk=257-5950834-2508848; x-wl-uid=1bEcLG2b03/1tAwPJNyfuRH+U7J9ZaPYejSBR4HXKuYQPJtLhQbDYyO/GOMypGKXqZrG7qBkS0ng=; session-token=x04EF8doE84tE+6CXYubsjmyob/3M6fdmsQuqzD0jwl/qGdO5aRc2eyhGiwoD0TFzK1rR/yziHsDS4v6cdqT2DySFXFZ9I5OHEtgufqBMEyrA0/Scr87KKA+GWOjfVmKRuPCqOGaixZQ6AIjU3e2iFOdM+3v90NeXFI3cazZcd6x9TYCy9b5u9V8zR7ePbdP; session-id-time=2082758401l; csm-hit=tb:MAA188S1G57TNTH6HQCZ+s-T9EGT4C8FC8J74X5T7CY|1594212767446&t:1594212767446&adb:adblk_no",
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
def change_address(postal):
    """Set Amazon's delivery address to *postal* via the location popover.

    Relies on the module-global ``driver`` (a running Chrome webdriver).
    ``postal`` may be a plain ZIP ('NNNNN') or ZIP+4 ('NNNNN-NNNN'); the
    hyphenated form is split across Amazon's two-part ZIP inputs when the
    single-field input is absent. Loops (refreshing the page) until the
    ZIP has been typed, then clicks the update button.

    Fixes vs. original: uses the Selenium 4 ``find_element(By.ID, ...)``
    API (``find_element_by_id`` was removed) and drops the misleading
    ``except Exception as NoSuchElementException`` alias, which bound ANY
    exception to the name of the Selenium exception class.
    """
    from selenium.webdriver.common.by import By
    while True:
        try:
            # Open the delivery-location popover in the nav bar.
            driver.find_element(By.ID, 'glow-ingress-line1').click()
            time.sleep(1)
        except Exception:
            # Page not ready / element missing — reload and retry.
            driver.refresh()
            time.sleep(3)
            continue
        try:
            # Switch the popover into "enter a ZIP code" mode
            # (the link is absent when it is already in that mode).
            driver.find_element(By.ID, "GLUXChangePostalCodeLink").click()
            time.sleep(2)
        except Exception:
            pass
        try:
            # Single-field ZIP input.
            driver.find_element(By.ID, 'GLUXZipUpdateInput').send_keys(postal)
            time.sleep(1)
            break
        except Exception:
            try:
                # Two-field ZIP+4 input: send each half of 'NNNNN-NNNN'.
                driver.find_element(By.ID, 'GLUXZipUpdateInput_0').send_keys(postal.split('-')[0])
                time.sleep(1)
                driver.find_element(By.ID, 'GLUXZipUpdateInput_1').send_keys(postal.split('-')[1])
                time.sleep(1)
                break
            except Exception:
                driver.refresh()
                time.sleep(3)
                continue
    print("重新选择地址")
    driver.find_element(By.ID, 'GLUXZipUpdate').click()
    time.sleep(1)
    driver.refresh()
def initializate_options():
    """Build the ChromeOptions shared by every headless browser session."""
    opts = webdriver.ChromeOptions()
    # Headless, sandbox-free Chrome suitable for scraping runs.
    for flag in ('--no-sandbox', '--headless', '--disable-gpu',
                 'disable-web-security', 'disable-infobars'):
        opts.add_argument(flag)
    # Suppress the "controlled by automated software" infobar.
    opts.add_experimental_option('excludeSwitches', ['enable-automation'])
    return opts
# Module-level options object reused whenever a Chrome instance is launched.
options = initializate_options()
def get_price(html):
    """Return the product price text from a parsed product page.

    :param html: an lxml element (or any object) exposing ``.xpath``.
    :return: the first ``a-offscreen`` span's text, or ``'0'`` when the
        page carries no price element.
    """
    x_price = html.xpath('//span[@class="a-offscreen"]/text()')  # price candidates
    try:
        product_price = x_price[0]
    except IndexError:  # was a bare except: only an empty result is expected here
        product_price = '0'
    return product_price
def get_items(req):
    """Parse a product page with XPath and extract headline product info.

    :param req: either the raw HTML string or a requests.Response object.
    :return: (product_star, review_num, five_point_review) where
        product_star is the star-rating title text, review_num is the
        digits-only review count string, and five_point_review is the list
        of bullet-point descriptions.
    :raises IndexError: if the star/review-count elements are missing
        (callers wrap this call in a try block).
    """
    if isinstance(req, str):  # was type(req) == str — isinstance is the idiom
        html = etree.HTML(req)
    else:
        html = etree.HTML(req.text)
    # Overall product star rating.
    product_star = html.xpath('//div[@id="averageCustomerReviews_feature_div"]//span[@id="acrPopover"]/@title')[0]
    print(product_star)
    product_rate0 = html.xpath('//div[@id="averageCustomerReviews_feature_div"]//span[@id="acrCustomerReviewText"]/text()')[0]
    # Strip every non-digit to leave just the review count.
    review_num = re.sub(r"\D", "", product_rate0)
    print('参与打分的总人数: ', review_num)
    # The product's five bullet-point highlights.
    five_point_review = html.xpath('//div[@id="featurebullets_feature_div"]//ul//span[@class="a-list-item"]/text()')
    return product_star, review_num, five_point_review
def get_review(url_path, review_num):
    """Collect review samples for one product across paginated review pages.

    Starts from the product's all-reviews page and keeps following the
    "Next page" link until at least ``review_num`` reviews are gathered or
    Selenium can no longer find a next-page element.
    """
    product_review = []  # accumulated review dicts from every page
    tmp_link = all_review_page(url_path)
    target = int(review_num)
    while len(product_review) < target:
        try:
            product_review.extend(get_review_function(tmp_link))
            # Advance to the next review page.
            tmp_link = get_new_link(tmp_link)
        except selenium.common.exceptions.NoSuchElementException:
            # No "Next page" button left — we have everything available.
            print('采集数据完成', len(product_review))
            break
    return product_review
def get_already_coped(review_dir='/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/review_database'):
    """Return the ASINs whose reviews have already been saved.

    :param review_dir: directory of previously saved ``<ASIN>.json`` files
        (parameterized; the old hard-coded path remains the default).
    :return: list of filenames with the 5-char ``.json`` suffix stripped.
    """
    return [file[:-5] for file in os.listdir(review_dir)]
if __name__ == '__main__':
    # Launch Chrome per product URL, scrape metadata + reviews, save JSON.
    url_list = get_all_url()
    already_coped_list = get_already_coped()
    for url in tqdm(url_list):
        # Skip products whose ASIN (last 10 chars of the URL) is already saved.
        if url[-10:] in already_coped_list:
            continue
        driver = None
        try:
            # 'options=' replaces the 'chrome_options=' keyword removed in Selenium 4.
            driver = webdriver.Chrome(options=options)
            postal = "20237"  # Washington, D.C. ZIP (for change_address if needed)
            print("正在爬取初始页面", url)
            driver.get(url)
            req, error = gethtml(url, hea)  # static fetch with default headers
            product_star, review_num, five_point_review = get_items(req)
            product_review = get_review(url, review_num)
            save_data(product_star, review_num, five_point_review, product_review, url)
            sleep(20)  # throttle between products to avoid being blocked
        except Exception as exc:
            # Was a bare 'except: pass' — at least report which URL failed.
            print('failed:', url, exc)
        finally:
            # Quit even on failure; the original leaked a Chrome process per error.
            if driver is not None:
                driver.quit()
function.py
"""
-*- coding: utf-8 -*-
author: Hao Hu
@date 2022/5/5 10:13 PM
"""
from selenium import webdriver
from time import sleep
import time
from lxml import etree
from selenium.webdriver.common.by import By
import requests
import os, json
# Static HTTP request headers for requests.get(); impersonates desktop Chrome.
# NOTE(review): duplicated from use_reptile_to_get_review.py, and the Cookie
# is a captured session token that will expire — consider refreshing/removing.
hea = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'max-age=0',
    'downlink': '8',
    'ect': '4g',
    'rtt': '250',
    'Cookie': "session-id=257-3500989-3695223; i18n-prefs=GBP; ubid-acbuk=257-5950834-2508848; x-wl-uid=1bEcLG2b03/1tAwPJNyfuRH+U7J9ZaPYejSBR4HXKuYQPJtLhQbDYyO/GOMypGKXqZrG7qBkS0ng=; session-token=x04EF8doE84tE+6CXYubsjmyob/3M6fdmsQuqzD0jwl/qGdO5aRc2eyhGiwoD0TFzK1rR/yziHsDS4v6cdqT2DySFXFZ9I5OHEtgufqBMEyrA0/Scr87KKA+GWOjfVmKRuPCqOGaixZQ6AIjU3e2iFOdM+3v90NeXFI3cazZcd6x9TYCy9b5u9V8zR7ePbdP; session-id-time=2082758401l; csm-hit=tb:MAA188S1G57TNTH6HQCZ+s-T9EGT4C8FC8J74X5T7CY|1594212767446&t:1594212767446&adb:adblk_no",
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
def initializate_options():
    """Build and return the ChromeOptions shared by all webdriver sessions."""
    # Launch and configure a headless, sandbox-free Chrome.
    options = webdriver.ChromeOptions()  # initialize Chrome options
    options.add_argument('--no-sandbox')
    options.add_argument('--headless')
    options.add_argument('--disable-gpu')
    options.add_argument("disable-web-security")
    options.add_argument('disable-infobars')
    # Hide the "Chrome is being controlled by automated software" infobar.
    options.add_experimental_option('excludeSwitches', ['enable-automation'])
    return options
# Module-level options instance reused by get_new_link().
options = initializate_options()
def gethtml(url0, head):
    """Fetch a static page's HTML with retries for timeouts and error codes.

    :param url0: URL to fetch.
    :param head: request headers dict.
    :return: (response, repeat) where repeat is the number of non-200 retries
        performed; a non-200 response may still be returned after 3 attempts.
    :raises Exception: after 5 request-level failures (timeouts etc.).
    """
    for attempt in range(5):  # request-level retries (timeouts, connection errors)
        try:
            html = requests.get(url=url0, headers=head, timeout=(10, 20))
            repeat = 0
            while html.status_code != 200:  # retry non-200 responses with backoff
                print('error: ', html.status_code)
                time.sleep(20 + repeat * 5)
                repeat += 1
                html = requests.get(url=url0, headers=head, timeout=(10, 20))
                if html.status_code != 200 and repeat == 2:
                    # Give up after 3 tries; caller gets the bad response + count.
                    return html, repeat
            return html, repeat
        except requests.exceptions.RequestException:
            print('超时重试次数: ', attempt + 1)
    # Was a bare 'raise Exception()' — say what actually went wrong.
    raise Exception('gethtml: failed to fetch %s after 5 attempts' % url0)
def get_all_url(database_dir='/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/database'):
    """Build the product-page URL for every file in the ASIN database dir.

    :param database_dir: directory whose filenames are ``<ASIN>.txt``
        (parameterized; the old hard-coded path remains the default).
    :return: list of 'https://www.amazon.com/dp/<ASIN>' URLs.

    URLs are built by plain concatenation — the original used os.path.join,
    which is for filesystem paths and is platform-dependent.
    """
    url_before = 'https://www.amazon.com/dp/'
    # file[:-4] strips the 4-char '.txt' suffix, leaving the ASIN.
    return [url_before + file[:-4] for file in os.listdir(database_dir)]
def all_review_page(url_path):
    """Return the URL of the product's "see all reviews" page.

    Fetches the product page and reads the see-all-reviews footer link.
    urljoin handles both absolute and root-relative hrefs correctly — the
    original string concatenation produced a double slash for hrefs that
    start with '/'.
    :raises IndexError: if the page has no see-all-reviews link.
    """
    from urllib.parse import urljoin
    html, repeat = gethtml(url_path, hea)
    tree = etree.HTML(html.text)
    hrefs = tree.xpath('//a[@data-hook="see-all-reviews-link-foot"]/@href')
    return urljoin('https://www.amazon.com/', hrefs[0])
def get_new_link(old_url):
    """Click "Next Page" on the current review page and return the new URL.

    :param old_url: URL of the current review page.
    :return: URL of the following review page.
    :raises selenium.common.exceptions.NoSuchElementException: when there is
        no next-page link (last page) — callers rely on this to stop paging.
    """
    # 'options=' replaces 'chrome_options=' (removed in Selenium 4).
    browser = webdriver.Chrome(options=options)
    try:
        browser.get(old_url)
        browser.find_element(By.XPATH, '//li[@class="a-last"]/a').click()
        return browser.current_url
    finally:
        # The original never quit the browser, leaking one Chrome per page.
        browser.quit()
def save_data(product_star, review_num, five_point_review, product_review, url,
              out_dir='/Users/huhao/Documents/GitHub/base_catree_Text_Categorization/review_database/'):
    """Persist one product's scraped data as ``<ASIN>.json`` under out_dir.

    :param product_star: overall star-rating text.
    :param review_num: digits-only review count string.
    :param five_point_review: list of bullet-point highlights.
    :param product_review: list of per-review sample dicts.
    :param url: product URL; its last 10 characters are the ASIN.
    :param out_dir: output directory (parameterized; old hard-coded default).
    """
    ids = url[-10:]  # ASIN = last 10 chars of the product URL
    sample_dict = {
        'asin': ids,
        'stars': product_star,
        'review_num': review_num,
        'highlights': five_point_review,
        'reviews': product_review,
    }
    json_path = os.path.join(out_dir, ids + '.json')
    # with-block guarantees the handle is closed (the original leaked it).
    with open(json_path, "w") as out_file:
        json.dump(sample_dict, out_file, indent=6)
def generate_sample(buyer_id, star_user, time_gived, size_product, colour_product, verified_information, review, review_title, people_found_useful_information):
    """Assemble one review's fields into a sample dict (one dict per review)."""
    return {
        'author': buyer_id,
        'stars': star_user,
        'date': time_gived,
        'is_verified_purchase': verified_information,
        'size_product': size_product,
        'colour_product': colour_product,
        'people_found_useful_num': people_found_useful_information,
        'review_title': review_title,
        'review': review,
    }
def get_review_function(url_link):
    """Scrape all reviews on one review page into a list of sample dicts.

    Each review element is serialized back to an HTML string and mined with
    plain string splitting (see cope_string). Reviews whose markup doesn't
    match the expected markers are skipped rather than aborting the page.
    """
    html, _ = gethtml(url_link, hea)  # default header
    tree = etree.HTML(html.text)
    review_element = tree.xpath('//div[@data-hook="review"]')
    review_sample_list = []
    # Size/colour strip texts: either 2 entries per review (size, colour)
    # or 1 (colour only) — disambiguated by the length ratio below.
    products_information_list = tree.xpath('//div[@class="a-section review aok-relative"]//a[@data-hook="format-strip"]/text()')
    for index in range(len(review_element)):
        sample = review_element[index]
        html_str = str(etree.tostring(sample))
        colour_product = ''
        size_product = ''
        try:
            buyer_id, star_user, time_gived, review_content, review_title = cope_string(html_str)
            # Text before the review body says whether this is a Vine
            # (free-product) review or an ordinary verified purchase.
            current_str = html_str.split('<span data-hook="review-body"')[0]
            if 'Vine' in current_str:
                verified_information = 'Vine Customer Review of Free Product'
            else:
                verified_information = 'verified Purchase'
            try:
                if len(products_information_list) > len(review_element):
                    size_product = products_information_list[index * 2]
                    colour_product = products_information_list[index * 2 + 1]
                else:
                    colour_product = products_information_list[index]
            except IndexError:  # was bare except: review without a format strip
                pass
            people_found_useful_information = get_found_useful_information_num(html_str)
            review_sample = generate_sample(buyer_id, star_user, time_gived, size_product, colour_product, verified_information, review_content, review_title, people_found_useful_information)
            review_sample_list.append(review_sample)
        except Exception:
            # Best-effort: skip unparsable reviews instead of failing the page
            # (was a bare except — at least don't swallow KeyboardInterrupt).
            pass
    return review_sample_list
def get_found_useful_information_num(html_str):
    """Extract the "N people found this helpful" count from review HTML.

    :param html_str: serialized HTML of one review element.
    :return: '<N> people found this helpful', or the zero form when the
        vote marker is absent.
    """
    try:
        # Text between the 'cr-vote-text">' marker and the helpful-button div
        # holds the count; ' &#' starts the trailing HTML entity.
        current_str = html_str.split('<div class="cr-helpful-button aok-float-left">')[0].split('cr-vote-text">')[1]
        people_found_useful_information = current_str.split(' &#')[0] + ' people found this helpful'
    except IndexError:  # was bare except: marker missing means zero votes shown
        people_found_useful_information = '0 people found this helpful'
    return people_found_useful_information
def cope_string(html_str):
    """Mine one review's fields out of its serialized HTML string.

    Every field is recovered by splitting on Amazon's markup markers.
    Returns (account, stars, date_text, body, title); an IndexError
    propagates to the caller when a marker is missing.
    """
    # Reviewer id sits inside the /gp/profile/<id>/ link.
    account = html_str.split('/gp/profile/')[1].split('/')[0]
    # Star rating, e.g. "5.0 out of 5 stars" (trailing HTML entity stripped).
    stars = html_str.split('<span class="a-icon-alt">')[1]
    stars = stars.split('</span>')[0].split('&#')[0]
    # Review date text between the date marker and the following div.
    date_text = html_str.split('review-date">')[1].split('</span><div class=')[0]
    # Review body: inner <span> of the review-body element.
    body = html_str.split('data-hook="review-body"')[1]
    body = body.split('<span>')[1].split('</span>')[0]
    # Review title: between the title marker and the date marker.
    title = html_str.split('data-hook="review-title"')[1]
    title = title.split('<span data-hook="review-date"')[0]
    title = title.split('<span>')[1].split('</span>')[0]
    return account, stars, date_text, body, title
if __name__ == '__main__':
    # Ad-hoc manual test: scrape a single review page.
    options = initializate_options()
    # Earlier test targets kept for reference (the last assignment wins —
    # the original reassigned url_path immediately, leaving dead code):
    # url_path = 'https://www.amazon.com/dp/B0921T6QFC'
    # url_path = 'https://www.amazon.com/Pawstory-Scratching-Multi-Level-Hammock-Furniture/product-reviews/B09FZ9ZV55/ref=cm_cr_arp_d_paging_btm_next_3?ie=UTF8&reviewerType=all_reviews&pageNumber=3'
    url_path = 'https://www.amazon.com/Made4Pets-Featuring-Sisal-Covered-Scratching-Spacious/product-reviews/B081SH7JVJ/ref=cm_cr_arp_d_paging_btm_next_8?ie=UTF8&reviewerType=all_reviews&pageNumber=8'
    get_review_function(url_path)
|