[Python知识库] Python-Seleniu批量爬取知网论文

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> Python-Seleniu批量爬取知网论文 -> 正文阅读

[Python知识库]Python-Seleniu批量爬取知网论文

selenium是很简单的爬虫工具，原理就是根据HTML代码，搜索到对应的标签然后模拟手动点击操作，HTML源码右击网页点击检查然后手动找到需要操作的元素。

第一步

下载chrome对应版本的webdriver，我的环境是macos，因此参考了如下文章：Mac安装Chromedriver

第二步

学会selenium基本语法，找个博客参考一下就行了。一般有几种定位方式，相对定位一般来说好用一些，如果使用绝对定位可能因为url的刷新搜索不到对应地址的标签。

Python Selenium库的使用

第三步

有一些注意事项：

因为网页刷新有延迟时间，因此需要睡眠一定时间等待网页刷新，否则可能经常抛出异常
有时候还是定位不到对应的标签，如果遇到异常就尝试再次定位，当然也有可能本身该标签元素不存在
知网对爬虫有一定监测，有时会异常结束，有时需要输入验证码。异常结束采取的方法是记录上次爬到了第几篇文章，下次运行定位到下一篇文章开始爬取。验证码这个需要手动输入，速度快点，因为有时仅仅是翻了好多页就需要输入验证码。

代码参考

import csv
import time
from selenium import webdriver
from selenium.webdriver.common.by import By

# 校园vpn知网入口，这一部分根据自身学校自行调整

'''
driver = webdriver.Chrome()
driver.maximize_window()
driver.get('vpn链接')
account = '学号'
password = '密码'
input_account = driver.find_element(By.ID, 'user_login')
input_account.send_keys(account)
input_password = driver.find_element(By.ID, 'user_password')
input_password.send_keys(password)
time.sleep(5)
login_button = driver.find_element(By.NAME, 'commit')
login_button.click()


# 进入知网入口
href = driver.find_element(By.CLASS_NAME, 'date_show_2').find_elements(By.TAG_NAME, 'dl')[4].find_element(By.TAG_NAME, 'a')
href.click()
time.sleep(5)
'''


# 进入知网的高级检索目录
advanced_search = driver.find_element(By.ID, 'highSearch')
advanced_search.click()
time.sleep(5)
driver.switch_to.window(driver.window_handles[0])
driver.close()
driver.switch_to.window(driver.window_handles[0])
time.sleep(2)


# 点击学术期刊，然后点击cssci
qikan_button = driver.find_element(By.CLASS_NAME, 'doctype-menus').find_element(By.NAME, 'dbcode')
qikan_button.click()
time.sleep(3)
cssci = driver.find_element(By.XPATH, "//div[@class='extend-tit-labels' and @id='JournalSourceType']/label[5]/input")
cssci.click()

# 输入检索语句
input_key_words = driver.find_element(By.CLASS_NAME, 'input-box').find_element(By.TAG_NAME, 'input')
# key_words = input("输入检索语句：")  # 手动输入
key_words = "XXXXX"
input_key_words.send_keys(key_words)

# 点击检索按钮
search_button = driver.find_element(By.CLASS_NAME, 'search-buttons').find_element(By.TAG_NAME, 'input')
search_button.click()
time.sleep(20)

# # 点击设置检索年限（出了问题，这里采用的策略是点击检索后进程睡眠较长一段时间，手动设置年限）
# driver.find_element(By.ID, 'divGroup').find_elements(By.TAG_NAME, 'dl')[2].click()
# time.sleep(1)
# year_set = driver.find_element(By.ID, 'divGroup').find_elements(By.TAG_NAME, 'dl')[2].find_element(By.XPATH, "//ul/li[4]/input")
# year_set.click()
# time.sleep(2)

# 得到一共有多少条结果以及页数
results = driver.find_element(By.CLASS_NAME, 'pagerTitleCell').find_element(By.TAG_NAME, 'em').text
string = ""
for chr in results:
    if '0' <= chr and chr <= '9':
        string += chr
num_results = int(string)
num_pages = int(num_results/20)
if num_results % 20 != 0:
    num_pages += 1

count = 0
preNum = 2561

while True:
    ok = False
    try:
        next_pages = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')
    except Exception:
        time.sleep(10)
        next_pages = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')
    for i in range(2, len(next_pages) - 1):
        page = int(next_pages[i].text)
        if ((page - 1) * 20 < preNum) and (preNum <= page * 20):
            ok = True
            count = (page - 1) * 20
            next_pages[i].click()
            break
    if ok:
        break
    else:
        next_pages[-2].click()
        time.sleep(5)

# 设置好csv文件
csv_file = open("/Users/lianyuhao/projects/WordCloud/csv/minzurentong.csv", 'a')
csv_writer = csv.writer(csv_file)
if preNum == 0:
    csv_writer.writerow(['编号', '文章名', '作者', '文献来源', '关键词', '摘要'])

# 提取出每页每篇文档的信息写入csv文件
for j in range(0, num_pages, 1):
    time.sleep(5)
    seq_tags = driver.find_elements(By.CLASS_NAME, 'seq')
    author_tags = driver.find_elements(By.CLASS_NAME, 'author')  # 作者
    source_tags = driver.find_elements(By.CLASS_NAME, 'source')  # 来源
    time.sleep(2)

    for i in range(len(seq_tags)):
        count = count + 1
        if count < preNum:
            continue
        # 获取url可能页面刷新后新旧元素异常，如果异常尝试获取两次
        # 获取编号
        try:
            index = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('data-cur')
        except Exception:
            time.sleep(3)
            index = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('data-cur')
        time.sleep(2)
        temp = []
        # 获取作者
        try:
            author_a_tags = author_tags[i].find_elements(By.TAG_NAME, 'a')
            for k in author_a_tags:
                temp.append(k.text)
        except Exception:
            time.sleep(3)
            author_a_tags = author_tags[i].find_elements(By.TAG_NAME, 'a')
            for k in author_a_tags:
                temp.append(k.text)
        authors = ';'.join(temp)
        time.sleep(2)
        # 获取期刊来源
        try:
            source = source_tags[i].find_element(By.TAG_NAME, 'a').text
        except Exception:
            time.sleep(3)
            source = source_tags[i].find_element(By.TAG_NAME, 'a').text
        time.sleep(2)
        # 获取每篇文章的url
        try:
            target_str = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('value')
        except Exception:
            time.sleep(3)
            target_str = seq_tags[i].find_element(By.TAG_NAME, 'input').get_attribute('value')
        # 这里的value以"!"分割dbname和filename，取出前两个即可
        dbcode = target_str[0:4]
        split_string = target_str.split("!")
        dbname = split_string[0]
        filename = split_string[1]
        url = 'https://kns-cnki-net-443.webvpn.scuec.edu.cn/kcms/detail/detail.aspx?dbcode=' + dbcode + '&dbname=' + dbname + '&filename=' + filename
        # 进入url链接
        driver.execute_script('window.open()')
        driver.switch_to.window(driver.window_handles[-1])  # 切换到最新页面
        driver.get(url)
        # 获取文章标题
        try:
            title = driver.find_element(By.CLASS_NAME, "wx-tit").find_element(By.TAG_NAME, 'h1').text
        except Exception:
            time.sleep(3)
            title = driver.find_element(By.CLASS_NAME, "wx-tit").find_element(By.TAG_NAME, 'h1').text
        time.sleep(2)
        # 获取文章摘要
        try:
            summary = driver.find_element(By.CLASS_NAME, "abstract-text").text
        except Exception:
            summary = ""
        time.sleep(2)
        keywords = ""
        # 获取关键字
        try:
            keywords_tags = driver.find_element(By.CLASS_NAME, "keywords").find_elements(By.TAG_NAME, 'a')
            for k in keywords_tags:
                keywords = keywords + k.text
        except Exception:
            pass
        time.sleep(2)
        csv_writer.writerow([index, title, authors, source, keywords, summary])
        print("第" + index + "篇论文爬取成功！")
        driver.close()
        # 关闭当前页面后要再次切换为第一个页面否则会bug
        driver.switch_to.window(driver.window_handles[0])

    time.sleep(5)
    # 翻到下一页，若有 x 页则翻页 x - 1 次
    if j < num_pages - 1:
        try:
            next_page = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')[-1]
        except Exception:
            time.sleep(3)
            next_page = driver.find_element(By.CLASS_NAME, 'pages').find_elements(By.TAG_NAME, 'a')[-1]
        next_page.click()
        time.sleep(5)
driver.close()