Selenium is a very simple crawling tool. The principle: given the page's HTML, find the tag you care about and simulate a manual click on it. To find an element's HTML, right-click the page, choose Inspect, and hunt down the element you need to operate on.
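For instance, a minimal Selenium interaction looks like the sketch below; the URL and the my-button id are hypothetical placeholders for whatever you found via Inspect.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://example.com')                 # open the target page
button = driver.find_element(By.ID, 'my-button')  # locate the tag found via Inspect
button.click()                                    # simulate the manual click
driver.quit()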
Step 1
Download the ChromeDriver build that matches your Chrome version. My environment is macOS, so I followed this article: Mac安装Chromedriver (installing ChromeDriver on a Mac).
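Once installed, Selenium will find ChromeDriver on your PATH automatically; if it does not, you can point at the binary explicitly. A sketch, assuming the Homebrew install location (adjust to wherever yours ended up):

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# assumed path for a Homebrew install on Intel macOS; Apple Silicon puts it under /opt/homebrew/bin
service = Service('/usr/local/bin/chromedriver')
driver = webdriver.Chrome(service=service)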
Step 2
Learn Selenium's basic syntax; any reference blog will do, for example the one below. There are several locator strategies; relative locators are generally the more robust choice, because an absolute locator can fail to match the intended tag once the URL refreshes. The sketch after the link contrasts the two.
Python Selenium库的使用 (using Python's Selenium library)
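A sketch of the contrast; the XPaths and the user_login id are hypothetical, so substitute the attributes of the element you actually inspected.

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
driver.get('https://example.com')  # stand-in URL

# Absolute locator: a full path from the root; breaks whenever the layout shifts
absolute = driver.find_element(By.XPATH, '/html/body/div[2]/form/input')

# Relative locators: anchored to stable attributes, so they survive layout changes
relative = driver.find_element(By.XPATH, "//input[@id='user_login']")
same = driver.find_element(By.ID, 'user_login')
driver.quit()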
Step 3
A few things to watch out for:
- Page refreshes are delayed, so sleep for a while to let the page reload; otherwise lookups frequently throw exceptions (an explicit-wait alternative is sketched after this list).
- Sometimes the tag still cannot be located; when an exception is raised, simply retry the lookup, though of course the element may genuinely not exist.
- CNKI runs some anti-crawler detection: runs sometimes abort, and sometimes a CAPTCHA appears. For aborts, record the number of the last article scraped and, on the next run, seek to the following article and resume. The CAPTCHA must be typed in by hand, and quickly, because sometimes merely flipping through many pages is enough to trigger one.
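Rather than sprinkling fixed time.sleep calls, the first two caveats can be handled with an explicit wait plus a None fallback; a minimal sketch (find_with_wait is a hypothetical helper, not part of Selenium):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

def find_with_wait(driver, by, value, timeout=10):
    # Poll until the element appears instead of sleeping a fixed time.
    try:
        return WebDriverWait(driver, timeout).until(
            EC.presence_of_element_located((by, value))
        )
    except TimeoutException:
        return None  # the element may genuinely not exist on this page

# usage: returns the element, or None if it never shows up
# title_tag = find_with_wait(driver, By.CLASS_NAME, 'wx-tit')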
Reference code
import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
# Log in through the campus VPN (fill in your own URL and credentials)
driver = webdriver.Chrome()
driver.maximize_window()
driver.get('vpn链接')  # VPN login URL (placeholder)
account = '学号'       # student ID (placeholder)
password = '密码'      # password (placeholder)
input_account = driver.find_element(By.ID, 'user_login')
input_account.send_keys(account)
input_password = driver.find_element(By.ID, 'user_password')
input_password.send_keys(password)
time.sleep(5)
login_button = driver.find_element(By.NAME, 'commit')
login_button.click()
# Enter the CNKI portal from the VPN landing page
href = driver.find_element(By.CLASS_NAME, 'date_show_2').find_elements(By.TAG_NAME, 'dl')[4].find_element(By.TAG_NAME, 'a')
href.click()
time.sleep(5)
# Open advanced search; it opens in a new tab, so close the old one
advanced_search = driver.find_element(By.ID, 'highSearch')
advanced_search.click()
time.sleep(5)
driver.switch_to.window(driver.window_handles[0])
driver.close()
driver.switch_to.window(driver.window_handles[0])
time.sleep(2)
# Restrict the search to the journal (期刊) database
qikan_button = driver.find_element(By.CLASS_NAME, 'doctype-menus').find_element(By.NAME, 'dbcode')
qikan_button.click()
time.sleep(3)
# Tick the CSSCI source filter
cssci = driver.find_element(By.XPATH, "//div[@class='extend-tit-labels' and @id='JournalSourceType']/label[5]/input")
cssci.click()
# Type the search keywords and submit
input_key_words = driver.find_element(By.CLASS_NAME, 'input-box').find_element(By.TAG_NAME, 'input')
key_words = "XXXXX"  # search keywords (placeholder)
input_key_words.send_keys(key_words)
search_button = driver.find_element(By.CLASS_NAME, 'search-buttons').find_element(By.TAG_NAME, 'input')
search_button.click()
time.sleep(20)
# Total number of hits (the text contains the count plus other characters)
results = driver.find_element(By.CLASS_NAME, 'pagerTitleCell').find_element(By.TAG_NAME, 'em').text
string = ""
for chr in results:
if '0' <= chr and chr <= '9':
string += chr
num_results = int(string)
num_pages = int(num_results/20)
if num_results % 20 != 0:
num_pages += 1
count = 0
preNum = 2561
# Page forward until we reach the page that contains article preNum
while True:
    ok = False
    try:
        next_pages = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')
    except Exception:
        time.sleep(10)  # page not ready yet; wait and retry once
        next_pages = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')
    for i in range(2, len(next_pages) - 1):  # numbered page links only (skip the surrounding controls)
        page = int(next_pages[i].text)
        if (page - 1) * 20 < preNum <= page * 20:
            ok = True
            count = (page - 1) * 20
            next_pages[i].click()
            break
    if ok:
        break
    else:
        next_pages[-2].click()  # target page not visible yet; jump ahead
        time.sleep(5)
csv_file = open("/Users/lianyuhao/projects/WordCloud/csv/minzurentong.csv", 'a')
csv_writer = csv.writer(csv_file)
if preNum == 0:
    # fresh run: write the header (ID, title, authors, source, keywords, abstract)
    csv_writer.writerow(['编号', '文章名', '作者', '文献来源', '关键词', '摘要'])
for j in range(0, num_pages):
    time.sleep(5)
    # per-row cells on the result list: sequence number, authors, source
    seq_tags = driver.find_elements(By.CLASS_NAME, 'seq')
    author_tags = driver.find_elements(By.CLASS_NAME, 'author')
    source_tags = driver.find_elements(By.CLASS_NAME, 'source')
    time.sleep(2)
    for i in range(len(seq_tags)):
        count = count + 1
        if count < preNum:
            continue  # already scraped on a previous run
        try:
            index = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('data-cur')
        except Exception:
            time.sleep(3)  # retry once after a short wait
            index = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('data-cur')
        time.sleep(2)
        # authors: join the linked author names with ';'
        temp = []
        try:
            author_a_tags = author_tags[i].find_elements(By.TAG_NAME, 'a')
            for k in author_a_tags:
                temp.append(k.text)
        except Exception:
            time.sleep(3)
            author_a_tags = author_tags[i].find_elements(By.TAG_NAME, 'a')
            for k in author_a_tags:
                temp.append(k.text)
        authors = ';'.join(temp)
        time.sleep(2)
        # journal the article was published in
        try:
            source = source_tags[i].find_element(By.TAG_NAME, 'a').text
        except Exception:
            time.sleep(3)
            source = source_tags[i].find_element(By.TAG_NAME, 'a').text
        time.sleep(2)
        # the row's checkbox value encodes dbname!filename; build the detail-page URL from it
        try:
            target_str = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('value')
        except Exception:
            time.sleep(3)
            target_str = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('value')
        dbcode = target_str[0:4]
        split_string = target_str.split("!")
        dbname = split_string[0]
        filename = split_string[1]
        url = 'https://kns-cnki-net-443.webvpn.scuec.edu.cn/kcms/detail/detail.aspx?dbcode=' + dbcode + '&dbname=' + dbname + '&filename=' + filename
        # open the detail page in a new tab
        driver.execute_script('window.open()')
        driver.switch_to.window(driver.window_handles[-1])
        driver.get(url)
        try:
            title = driver.find_element(By.CLASS_NAME, "wx-tit").find_element(By.TAG_NAME, 'h1').text
        except Exception:
            time.sleep(3)
            title = driver.find_element(By.CLASS_NAME, "wx-tit").find_element(By.TAG_NAME, 'h1').text
        time.sleep(2)
        try:
            summary = driver.find_element(By.CLASS_NAME, "abstract-text").text
        except Exception:
            summary = ""  # some articles have no abstract
        time.sleep(2)
        keywords = ""
        try:
            keywords_tags = driver.find_element(By.CLASS_NAME, "keywords").find_elements(By.TAG_NAME, 'a')
            for k in keywords_tags:
                keywords = keywords + k.text
        except Exception:
            pass  # some articles have no keywords
        time.sleep(2)
        csv_writer.writerow([index, title, authors, source, keywords, summary])
        print("Scraped paper " + index + " successfully!")
        # close the detail tab and return to the result list
        driver.close()
        driver.switch_to.window(driver.window_handles[0])
        time.sleep(5)
if j < num_pages - 1:
try:
next_page = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')[-1]
except Exception:
time.sleep(3)
next_page = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')[-1]
next_page.click()
time.sleep(5)
driver.close()
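When a run dies, whether from CNKI's anti-crawler detection or a stray exception, note how far the last run got from the success messages, set preNum to that checkpoint, and rerun: the while loop pages forward to the right result page, and the count < preNum guard skips everything already collected. Use preNum = 0 for a fresh run so the CSV header row gets written.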