This is just a worked example, sharing the approach I put together myself.
Getting cookies
First of all, automation obviously needs cookies to log in to the account. The code is below:
You have to click through a few steps yourself here and log in to your own account; the script then grabs the cookie data and saves it to a file, so it can be reused separately later.
Key points
browers = webdriver.Chrome()       # launch a Chrome instance
browers.get(url)                   # open the target page
browers.close()                    # close the current window
cookies = browers.get_cookies()    # read all cookies for the session ('cookies' avoids shadowing the builtin list)
with open('../data/cookies.json', 'w', encoding='utf-8') as file:
    json.dump(cookies, file, indent=2, ensure_ascii=False)   # persist the cookies as readable JSON
Source code
import json
import time
from selenium import webdriver

browers = webdriver.Chrome()
url = 'https://bj.meituan.com'
browers.get(url)
time.sleep(30)                     # log in manually during this pause
cookies = browers.get_cookies()
print(cookies)
with open('../data/cookies.json', 'w', encoding='utf-8') as file:
    json.dump(cookies, file, indent=2, ensure_ascii=False)
browers.close()
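The saved cookies get loaded back into a fresh browser later on (the login module below calls add_cookies(), which is not shown in this excerpt). A minimal sketch of what such a helper could look like, assuming the cookies.json written above; note Selenium only accepts cookies for the domain currently open:
import json

def add_cookies():
    brower.get('https://bj.meituan.com')     # open the site first so the domain matches
    with open('../data/cookies.json', 'r', encoding='utf-8') as file:
        for cookie in json.load(file):
            cookie.pop('expiry', None)       # some drivers reject a saved expiry field
            brower.add_cookie(cookie)
    brower.refresh()                         # reload so the logged-in state takes effect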
Scraping the data
Key points
File operations
xlsx files
import openpyxl

book = openpyxl.load_workbook('../景区信息.xlsx')   # open an existing workbook
sheet1 = book['Sheet1']                             # pick a sheet by name
len_row = sheet1.max_row                            # number of used rows
sheet1.cell(num, 8).value                           # read/write a cell: cell(row, column), 1-indexed
book.save(filename="../景区信息.xlsx")              # write changes back to disk
csv files
import csv
import pandas

# csv.writer: plain rows; newline='' prevents blank lines between rows
with open(path, 'a', encoding='utf-8-sig', newline='') as csvfile:
    file = csv.writer(csvfile)
    if flag:                                             # write the header only once
        file.writerow(['评论时间', '评分', '评论内容'])
    file.writerows([['4', 'q', 21], ['5', 'q', 21]])     # several rows in one call

# csv.DictWriter: rows as dicts keyed by the field names
with open('data.csv', 'w', encoding='utf-8', newline='') as csvfile:
    fieldnames = ['id', 'name', 'age']
    file = csv.DictWriter(csvfile, fieldnames=fieldnames)
    file.writeheader()
    file.writerow({'id': 'Q', 'name': 'F', 'age': 'Y'})
    file.writerow({'id': '1', 'name': 21, 'age': '64'})

# csv.reader: each row comes back as a list of strings
with open('data.csv', 'r', encoding='utf-8') as file:
    read = csv.reader(file)
    for item in read:
        print(item)

# or load the whole file into a DataFrame
df = pandas.read_csv('data.csv')
print(df)
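For reading the DictWriter output back row by row, csv.DictReader is the counterpart; a small sketch (my addition, not in the original code):
import csv

with open('data.csv', 'r', encoding='utf-8', newline='') as csvfile:
    for row in csv.DictReader(csvfile):               # each row is a dict keyed by the header
        print(row['id'], row['name'], row['age'])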
xls files
import xlwt

workbook = xlwt.Workbook(encoding='utf-8')
booksheet = workbook.add_sheet('Sheet1', cell_overwrite_ok=True)   # allow cells to be rewritten
booksheet.write(0, 0, '1')                                         # write(row, col, value), 0-indexed
workbook.save('../data/景区评论/' + location_city + '/' + location_name + '.xls')
Locating HTML nodes
I locate nodes directly with XPath expressions. Because the network fluctuates, there are two lookup styles: a plain find and an explicit wait. The nodes are simply the places in the HTML where the information is stored; I scrape the text at those positions and then normalize the format with regex extraction and similar methods.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# plain lookup: returns whatever is in the DOM right now
img_locations = brower.find_elements_by_xpath('//*[@class="common-list-main"]//*[@class="default-list-item clearfix"]/a/img')
brower.implicitly_wait(1)
# explicit wait: block up to max_wait_time until the nodes are present
click_locations = WebDriverWait(brower, max_wait_time).until(EC.presence_of_all_elements_located(
    (By.XPATH, '//*[@class="common-list-main"]//*[@class="default-list-item clearfix"]/a')))
Regex extraction
idea_tag = r'(.*?)\|.*?'                 # the pipe must be escaped to match the literal separator
idea_evaluation = r'(.*?)分(.*?)人.*?'   # capture the score before 分 and the count before 人
result_tag = re.findall(idea_tag, List[3], re.S)
result_evaluation = re.findall(idea_evaluation, List[2], re.S)
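A quick illustration of what these patterns capture, run on made-up strings of the same shape as the scraped list items (the sample text is an assumption):
import re

print(re.findall(r'(.*?)\|.*?', '公园|4A景区', re.S))             # ['公园']
print(re.findall(r'(.*?)分(.*?)人.*?', '4.6分2051人评价', re.S))  # [('4.6', '2051')]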
Other useful bits
import os
from urllib.request import urlretrieve

name = '123'
os.mkdir(name)                                       # create one directory
os.makedirs(name)                                    # create nested directories recursively
path = os.path.dirname(os.path.abspath(__file__))    # directory containing the current script
path_1 = os.path.exists(path)                        # check whether a path exists
urlretrieve(img_url, '../data/景区图片/' + location_name + '.jpg')        # download a file
now_url = brower.current_url                         # URL of the current tab
brower.execute_script('window.scrollTo(0,document.body.scrollHeight)')   # scroll to the page bottom
img_url = item.get_attribute('src')                  # read an element attribute
item.click()                                         # click an element
item.text                                            # visible text of an element
search.send_keys(scenic_name)                        # type into an input box
Exception-handling modules
login
Because the network is unreliable and the crawl rate is high, all sorts of failures show up during scraping. When that happens, the login module is called: it notes which scenic spot failed, shuts the browser down for a while, then opens a new one and logs in again. The lost spots are recorded along the way.
def login(location_original_name):
    global brower
    # close every open window; iterating over a snapshot of the handles
    # avoids index errors, since window_handles shrinks after each close()
    for handle in brower.window_handles:
        brower.switch_to.window(handle)
        brower.close()
    time.sleep(10)                              # back off before reconnecting
    brower = webdriver.Chrome()                 # fresh browser instance
    lost_list.append(location_original_name)    # remember the spot that failed
    print(lost_list)
    add_cookies()                               # re-apply the saved login cookies
except_solve
When an exception occurs there may be several pages open, and you have to close the useless ones. Otherwise, after repeated exceptions your browser can end up with a hundred-plus windows and exhaust your machine's memory.
def except_solve():
    # keep the first window and close the rest, iterating over a snapshot
    # of the handles so the list does not shift while windows are closed
    handles = brower.window_handles
    for handle in handles[1:]:
        brower.switch_to.window(handle)
        brower.close()
    brower.switch_to.window(handles[0])
Saving the data
Saving images
def save_img(img_url, location_name):
    urlretrieve(img_url, '../data/景区图片/' + location_name + '.jpg')
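urlretrieve fails if the target folder does not exist yet; a variant (my addition, not in the original) that creates it first using the os.makedirs call from the section above:
import os
from urllib.request import urlretrieve

def save_img(img_url, location_name):
    folder = '../data/景区图片'
    os.makedirs(folder, exist_ok=True)   # create the folder if it is missing
    urlretrieve(img_url, folder + '/' + location_name + '.jpg')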
Saving a spot's basic information
def save_basis_information(map):
    # find the row whose original name matches, then fill in columns 8-12
    for num in range(2, len_row + 1):
        location_original_name = sheet1.cell(num, 1).value
        if location_original_name == map['location_original_name']:
            sheet1.cell(num, 8).value = map['location_name']
            sheet1.cell(num, 9).value = map['location_tag']
            sheet1.cell(num, 10).value = map['location_score']
            sheet1.cell(num, 11).value = map['location_evaluation_num']
            sheet1.cell(num, 12).value = map['location_price']
    book.save(filename="../景区信息.xlsx")
Saving a spot's comments
def save_comment_information(map, flag, num, booksheet):
    if flag:   # write the header row only on the first call
        booksheet.write(0, 0, '评论时间')
        booksheet.write(0, 1, '评价分数')
        booksheet.write(0, 2, '评论详细')
    booksheet.write(num, 0, map['comment_date'])
    booksheet.write(num, 1, map['comment_star'])
    booksheet.write(num, 2, map['comment_detail'])
Crawling the spots in turn and the HTML nodes involved
Fetching the spots one by one
My spreadsheet already holds the original scenic-spot names, so I simply read them out and search for each one. If you want to crawl everything from scratch instead, search for something like "Beijing scenic spots" and work through the results page by page; the method is much the same.
def get_location():
    location_now = 0
    for num in range(2, len_row + 1):
        location_original_name = sheet1.cell(num, 1).value
        location_city = sheet1.cell(num, 2).value
        location_now += 1
        if location_now <= now_sum:    # skip spots that were already crawled
            continue
        brower.get(url + 's/' + location_original_name + '/')
        time.sleep(1)
        now_url = brower.current_url
        print(now_url)
        # redirected to the login page or the error page: log in again
        if (len(now_url) >= 61 and now_url[:61] == login_url) or now_url == lost_url:
            login(location_original_name)
            time.sleep(3)
        try:
            get_basis_information(location_original_name, location_city)
        except Exception as e:
            print(e)
            print('失败')
            lost_list.append(location_original_name)
            except_solve()
        time.sleep(5)
Getting a spot's basic information
def get_basis_information(location_name, location_city):
    # wait until the search-result list has rendered
    basis_locations = WebDriverWait(brower, max_wait_time).until(
        EC.presence_of_all_elements_located((By.XPATH, '//*[@class="common-list-main"]')))
    print('one')
    img_locations = brower.find_elements_by_xpath('//*[@class="common-list-main"]//*[@class="default-list-item clearfix"]/a/img')
    print('two')
    click_locations = brower.find_elements_by_xpath('//*[@class="common-list-main"]//*[@class="default-list-item clearfix"]/a')
    print('three')
    img_save_name = ''
    for item in basis_locations:
        List = item.text.split()
        if List[0] == '对不起,没有符合条件的商家':
            lost_list.append(location_name)
            print(lost_list)
            break
        idea_tag = r'(.*?)\|.*?'        # escaped pipe: match up to the literal separator
        idea_evaluation = r'(.*?)分(.*?)人.*?'
        result_tag = re.findall(idea_tag, List[3], re.S)
        result_evaluation = re.findall(idea_evaluation, List[2], re.S)
        name_tag = ''
        img_save_name = List[1]
        for i in result_tag:
            if i != '|':
                name_tag += i
            else:
                break
        map = {
            'location_original_name': location_name,
            'location_name': List[1],
            'location_score': result_evaluation[0][0] + '分',
            'location_evaluation_num': result_evaluation[0][1],
            'location_tag': name_tag,
            'location_price': List[5]
        }
        save_basis_information(map)
        break                           # only the first search result is used
    for item in img_locations:
        img_url = item.get_attribute('src')
        save_img(img_url, img_save_name)
        break
    for item in click_locations:
        item.click()                    # opens the detail page in a new tab
        get_comments_information(location_name, location_city)
        break
    time.sleep(1)
Getting a spot's comments
def get_comments_information(location_name, location_city):
    brower.switch_to.window(brower.window_handles[1])   # detail page opened in a new tab
    page_num = 0
    comment_sum = 0
    csv_name = True      # header flag: True only for the first written row
    flag = True
    workbook = xlwt.Workbook(encoding='utf-8')
    booksheet = workbook.add_sheet('Sheet1', cell_overwrite_ok=True)
    try:
        while flag:
            flag = False
            page_num += 1
            if page_num > 30:            # cap at 30 comment pages per spot
                break
            comments = WebDriverWait(brower, max_wait_time).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@class="comment-main"]')))
            print('four')
            star = WebDriverWait(brower, max_wait_time).until(
                EC.presence_of_all_elements_located((By.XPATH, '//*[@class="comment-main"]//*[@class="rate-stars-ul rate-stars-light"]')))
            print('five')
            next_button = WebDriverWait(brower, max_wait_time).until(
                EC.presence_of_all_elements_located(
                    (By.XPATH, '//*[@class="pagination-item pagination-item-comment next-btn active"]')))
            print('six')
            read_button = brower.find_elements_by_xpath('//*[@class="read-btn"]')
            print('seven')
            for item in read_button:
                item.click()             # expand truncated comments
            detail_comments = brower.find_elements_by_xpath('//*[@class="user-comment"]')
            detail_comments_date = brower.find_elements_by_xpath('//*[@class="comment-date"]')
            detail_comments_list = []
            detail_comments_date_list = []
            star_list = []
            for item in detail_comments:
                detail_comments_list.append(item.text)
            for item in detail_comments_date:
                detail_comments_date_list.append(item.text)
            for item in star:
                # the rating is encoded as a percentage width in the style
                # attribute, e.g. 'width: 90%' means 90% of 5 stars = 4.5;
                # walk the string and collect the digits between ' ' and '%'
                star_num = item.get_attribute('style')
                flag1 = False
                star_num_1 = ''
                for i in star_num:
                    if i == '%':
                        break
                    if flag1:
                        star_num_1 += i
                    if i == ' ':
                        flag1 = True
                num = int(star_num_1) / 100 * 5
                star_list.append(num)
            for i in range(len(detail_comments)):
                comment_sum += 1
                map = {
                    'comment_detail': detail_comments_list[i],
                    'comment_date': detail_comments_date_list[i],
                    'comment_star': star_list[i]
                }
                save_comment_information(map, csv_name, comment_sum, booksheet)
                csv_name = False
            for item in next_button:
                item.click()             # go to the next comment page
                flag = True
            time.sleep(1)
    except Exception as e:
        print(e)
    finally:
        workbook.save('../data/景区评论/' + location_city + '/' + location_name + '.xls')
        time.sleep(1)
        brower.close()
        brower.switch_to.window(brower.window_handles[0])
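The character-walk that pulls the percentage out of the style attribute is fragile against formatting changes; a regex does the same extraction more robustly. A sketch (my addition, assuming style strings like 'width: 90%'):
import re

def star_from_style(style):
    # pull the percentage out of e.g. 'width: 90%;' and map it onto 5 stars
    match = re.search(r'(\d+(?:\.\d+)?)%', style)
    return float(match.group(1)) / 100 * 5 if match else None

print(star_from_style('width: 90%;'))   # 4.5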
Full source code for this crawl
Remember to grab the cookie yourself as described above. Since I wrote and revised this as I went, the logic may be a bit tangled and some parts are verbose and repetitive; I'm not going to clean it up further, so I'll just share the source as-is. There is quite a lot of preprocessing, so you'll need to read through it before you can use it directly.
Source link