[Python知识库] Python qichacha 上市公司专利爬虫

开发: C++知识库 Java知识库 JavaScript Python PHP知识库 人工智能 区块链 大数据 移动开发 嵌入式 开发工具 数据结构与算法 开发测试 游戏开发 网络协议 系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑 笔记本 显卡 显示器 固态硬盘 硬盘 耳机 手机 iphone vivo oppo 小米 华为 单反 装机 图拉丁

-> Python知识库 -> Python qichacha 上市公司专利爬虫 -> 正文阅读

[Python知识库]Python qichacha 上市公司专利爬虫

from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
from selenium import webdriver
import csv
import re
import numpy as np
import os

# Browser-like User-Agent header dict.
# NOTE(review): not referenced by any code visible in this file - confirm whether it is still needed.
afterLogin_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
# Earlier PhantomJS-based driver setup, left disabled:
# driver = webdriver.PhantomJS(executable_path=r'D:\code\patent_info\phantomjs-2.1.1-windows\bin\phantomjs.exe', service_args=['--ignore-ssl-errors=true', '--ssl-protocol=TLSv1'])
# Local path of the chromedriver executable handed to Selenium below.
chrome_driver = r'D:\code\patent_info\chromedriver.exe'
chrome_options = webdriver.ChromeOptions()
# Start Chrome with a dedicated user-data directory
# (presumably so a logged-in qcc.com session is reused between runs - TODO confirm).
chrome_options.add_argument(r'--user-data-dir=D:\code\patent_info\ChromeUserqichacha0623')
# Module-level browser instance; created at import time and shared by the
# functions in this file that call `driver.*`.
driver = webdriver.Chrome(executable_path = chrome_driver, options=chrome_options)
# driver.maximize_window()
# # driver.add_argument('--headless')  # enable headless (no window) mode

def get_company_message(company):
    '''
    Input: company is a chinese word (the company name used as search key)
    Todo: open the qcc.com search page for the company in the shared
          `driver`, take the href of the first <a class="title"> link and
          replace 'firm' with 'cassets' in it to get the patent url
    Return: patent url
    Raises: IndexError when the search page contains no result link
    '''
    # Function-scope import so the file-level import block stays untouched.
    from urllib.parse import quote

    # Percent-encode the name: characters such as '&', '#' or spaces would
    # otherwise truncate or split the `key` query parameter.
    search_url = 'https://www.qcc.com/search?key={}'.format(quote(company, safe=''))
    driver.get(search_url)
    # Fixed pause before reading the page (presumably to let the result
    # list render - TODO confirm whether an explicit wait would be better).
    time.sleep(5)
    print(search_url)

    soup = BeautifulSoup(driver.page_source, features="lxml")
    first_result = soup.find('a', {'class': 'title'})
    if first_result is None:
        # Same exception type the old `find_all(...)[0]` produced, but with
        # a message that names the company that could not be found.
        raise IndexError('no search result found for company {}'.format(company))

    href = first_result.get('href')
    print('href is {}'.format(href))
    href2 = href.replace('firm', 'cassets')
    print('href2 is {}'.format(href2))
    return href2

def iselementExist(element):
    '''
    Input: xpath of patent_table element
           (the script passes //*[@id="zhuanlilist"]/div[1]/h3)
    Todo: Whether the patent exists, i.e. whether the shared `driver`
          can locate an element for the xpath on the current page
    Return: bools (True when found, False when the lookup fails)
    '''
    try:
        driver.find_element_by_xpath(element)
    except Exception:
        # Any lookup failure still means "not present", but unlike a bare
        # `except:` this lets KeyboardInterrupt / SystemExit propagate so
        # the scraper can be stopped with Ctrl-C.
        return False
    return True
def get_patent_infomation(number_page):# get information single web
    '''
    Input: number_page is the HTML source of one patent-list page
           (callers pass driver.page_source)
    Todo: parse the page and collect every <td> cell matched by the
          selector '#zhuanlilist .app-ntable td'
    Return: the result of soup.select (the matched cells, in page order)
    '''
    parsed_page = BeautifulSoup(number_page, features="lxml")
    return parsed_page.select('#zhuanlilist .app-ntable td')

def save_patient(data_infos, key_company,
                 output_path=r'D:\code\patent_info\data\output_data\company_patient1.csv'):
    '''
    Input: 1.data_infos: get one page information from 'get_patent_infomation'
             (objects with a `.text` attribute, 10 consecutive cells per row)
           2.key_company: company name read from excel
           3.output_path: csv file to append to (defaults to the original
             hard-coded location)
    Todo: group the cell texts into rows of 10, label the columns, add the
          company name as first column and append the rows to the csv.
          The header line is written only when the file does not exist yet.
          Nothing is written when there is no complete row.
    return: None
    '''
    columns = [
        'Number',
        'Patent_name',
        'Patent_type',
        'Patent_statu',
        'Application number',
        'Apply_data',
        'Public_announcement_No',
        'Public_announcement_Data',
        'Inventor',
        'More',
    ]
    width = len(columns)
    cell_texts = [info.text for info in data_infos]

    # Only complete rows can be labelled; a trailing partial row would
    # otherwise break the table construction.
    usable = len(cell_texts) - len(cell_texts) % width
    if usable != len(cell_texts):
        print('skip {} cells of an incomplete row for {}'.format(
            len(cell_texts) - usable, key_company))
    rows = [cell_texts[start:start + width] for start in range(0, usable, width)]
    if not rows:
        # Empty table: the previous column slicing raised IndexError here.
        return

    dataframe = pd.DataFrame(rows, columns=columns)
    dataframe.insert(0, 'company', key_company)
    dataframe.to_csv(output_path, index=False, sep=',', mode='a',
                     encoding='gb18030',
                     # header only for a brand-new file so appended pages
                     # do not repeat it
                     header=not os.path.exists(output_path))

def whether_turn_page(element1):
    '''
    Input: xpath of the pages_number (pagination) element
           (the script passes //*[@id="zhuanlilist"]/div[4]/nav/ul)
    Todo: Whether the pages_number table exists, i.e. whether the shared
          `driver` can locate an element for the xpath on the current page
    Return: bools (True when found, False when the lookup fails)
    '''
    try:
        driver.find_element_by_xpath(element1)
    except Exception:
        # Any lookup failure still means "no pagination", but unlike a bare
        # `except:` this lets KeyboardInterrupt / SystemExit propagate.
        return False
    return True

def turn_next_page(key_company):
    '''
    Input: key_company: company name read from excel
    Todo: click through the remaining pages of the patent list and save
          each page with 'save_patient'. Two pagination layouts are handled:
          1. the last <li> is '>' : the page count is read from the
             second-to-last <li> and the last <li> is clicked
          2. the last <li> contains digits : those digits are the page count
             and the second-to-last <li> link is clicked
          When no page count can be read an error line is printed instead.
    return: None
    '''
    pager_xpath = '//*[@id="zhuanlilist"]/div[4]/nav/ul'
    last_item_xpath = pager_xpath + '/li[last()]'
    second_last_link_xpath = pager_xpath + '/li[last()-1]/a'

    # Look the last pagination item up once and reuse its text.
    list_max = driver.find_element_by_xpath(last_item_xpath).text
    print(list_max)

    if list_max == '>':
        page_count_text = driver.find_element_by_xpath(second_last_link_xpath).text
        next_button_xpath = last_item_xpath
    else:
        page_count_text = list_max
        next_button_xpath = second_last_link_xpath

    # Keep only the digits; an empty result used to crash int() before the
    # error branch could ever run.
    digits = re.sub(r"\D", "", page_count_text)
    if not digits or int(digits) == 0:
        print('error company name is {}'.format(key_company))
        return

    # The first page was already saved by the caller, so one click per
    # remaining page.
    for _ in range(int(digits) - 1):
        driver.find_element_by_xpath(next_button_xpath).click()
        time.sleep(1)
        data_infos = get_patent_infomation(driver.page_source)
        save_patient(data_infos, key_company)
# Script entry point: read company names from a csv file (one row per
# company) and append every patent-table page of each company to the
# output csv written by 'save_patient'.
if __name__ == '__main__':
    csv_file = r"D:\code\patent_info\patient1.csv"
    try:
        # newline='' is what the csv module expects for files it reads.
        with open(csv_file, encoding='utf-8', newline='') as csvfile:
            reader = csv.reader(csvfile)
            for i, key_company in enumerate(reader):
                print('i is {}'.format(i))
                print('rows is {}'.format(key_company))
                key_company = ' '.join(key_company)
                if not key_company.strip():
                    # blank csv line: nothing to search for
                    continue
                try:
                    patent_url = get_company_message(key_company) # patent url
                except IndexError:
                    # no search result for this name; keep going with the
                    # remaining companies instead of aborting the batch
                    print('no search result for {}'.format(key_company))
                    continue
                driver.get(patent_url)
                time.sleep(1)
                if iselementExist('//*[@id="zhuanlilist"]/div[1]/h3'):
                    data_infos = get_patent_infomation(driver.page_source)
                    save_patient(data_infos, key_company)
                    if whether_turn_page('//*[@id="zhuanlilist"]/div[4]/nav/ul'):
                        turn_next_page(key_company)
    finally:
        # Always close the browser, even when a company fails midway.
        driver.quit()