from bs4 import BeautifulSoup
import requests
import time
import pandas as pd
from selenium import webdriver
import csv
import re
import numpy as np
import os
# Browser-like User-Agent header (unused by the Selenium flow in this file;
# presumably intended for direct `requests` calls — TODO confirm).
afterLogin_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36'}
# Path to the local chromedriver binary.
chrome_driver = r'D:\code\patent_info\chromedriver.exe'
chrome_options = webdriver.ChromeOptions()
# Reuse a persistent Chrome profile directory so the qcc.com login session survives restarts.
chrome_options.add_argument(r'--user-data-dir=D:\code\patent_info\ChromeUserqichacha0623')
# Module-level driver shared by all functions below.
# NOTE(review): `executable_path=` was removed in Selenium 4 — this presumably targets Selenium 3.x; confirm pinned version.
driver = webdriver.Chrome(executable_path = chrome_driver, options=chrome_options)
def get_company_message(company):
    """Search qcc.com for *company* and return the URL of its patents page.

    Input: company -- company name (Chinese text) used as the search keyword.
    Return: URL string of the company's assets ("cassets") page, derived from
        the first search hit.  Raises IndexError if the search has no results.
    """
    search_url = 'https://www.qcc.com/search?key={}'.format(company)
    driver.get(search_url)
    time.sleep(5)  # give the search results time to render
    print(search_url)
    soup = BeautifulSoup(driver.page_source, features="lxml")
    # The first result link points at the company profile ("firm") page.
    first_hit = soup.find_all('a', {'class': 'title'})[0].get('href')
    print('href is {}'.format(first_hit))
    # The patent listing lives under "cassets" instead of "firm".
    assets_url = first_hit.replace('firm', 'cassets')
    print('href2 is {}'.format(assets_url))
    return assets_url
def iselementExist(element):
    """Return True if an element matching the XPath *element* is on the current page.

    Input: element -- XPath string locating the patent-table element.
    Return: bool -- True when the lookup succeeds, False when it raises.

    Fix: the original used a bare ``except:`` which also swallowed
    KeyboardInterrupt/SystemExit; narrowed to ``Exception`` and dropped the
    redundant flag variable.
    """
    try:
        driver.find_element_by_xpath(element)
        return True
    except Exception:  # NoSuchElementException in practice
        return False
def get_patent_infomation(number_page):
    """Extract the patent-table cells from a rendered page.

    Input: number_page -- raw HTML source of the company's patent page
        (typically ``driver.page_source``).
    Return: list of ``<td>`` nodes inside the patent list table.
    """
    soup = BeautifulSoup(number_page, features="lxml")
    return soup.select('#zhuanlilist .app-ntable td')
def save_patient(data_infos, key_company,
                 csv_path=r"D:\code\patent_info\data\output_data\company_patient1.csv"):
    """Flatten one page of patent-table cells and append them to a CSV file.

    Input:
        data_infos -- iterable of table-cell nodes (from get_patent_infomation);
            each exposes a ``.text`` attribute.  Cells arrive flat: every patent
            row spans 10 consecutive cells.
        key_company -- company name read from the input file; written into the
            'company' column of every row.
        csv_path -- output CSV path (new parameter, defaults to the original
            hard-coded location, now a raw string).
    Return: None.  Side effect: appends rows to *csv_path*; the header is
        written only when the file does not exist yet.

    Fixes: empty/ragged input used to crash (``np.array`` over a ragged list
    yields an object array and ``[:, 0]`` raises); the output path was
    duplicated three times as a non-raw string with invalid escapes; the
    if/else around ``to_csv`` collapsed into a single call.
    """
    texts = [info.text for info in data_infos]
    # Keep only complete 10-cell rows; a trailing partial row (or no data at
    # all) previously crashed the numpy-based transpose.
    usable = len(texts) // 10 * 10
    if usable == 0:
        return
    rows = [texts[i:i + 10] for i in range(0, usable, 10)]
    cols = list(zip(*rows))  # transpose: one tuple per output column
    dataframe = pd.DataFrame({
        'company': key_company,
        'Number': cols[0],
        'Patent_name': cols[1],
        'Patent_type': cols[2],
        'Patent_statu': cols[3],            # [sic] header kept for downstream compatibility
        'Application number': cols[4],
        'Apply_data': cols[5],              # [sic]
        'Public_announcement_No': cols[6],
        'Public_announcement_Data': cols[7],
        'Inventor': cols[8],
        'More': cols[9],
    })
    # Write the header only on first creation; always append.
    dataframe.to_csv(csv_path, index=False, sep=',', mode='a',
                     encoding='gb18030', header=not os.path.exists(csv_path))
def whether_turn_page(element1):
    """Return True if the pagination bar located by XPath *element1* exists.

    Input: element1 -- XPath string locating the pager element.
    Return: bool -- True when the lookup succeeds, False when it raises.

    Fix: the original used a bare ``except:`` which also swallowed
    KeyboardInterrupt/SystemExit; narrowed to ``Exception`` and dropped the
    redundant flag variable (same cleanup as iselementExist).
    """
    try:
        driver.find_element_by_xpath(element1)
        return True
    except Exception:  # NoSuchElementException in practice
        return False
def turn_next_page(key_company):
    """Walk through every remaining page of the patent table, saving each page.

    Input: key_company -- company name read from the input CSV (used for the
        'company' column when saving and for error logging).
    Return: None.  Side effect: clicks through the pager and appends every
        page's rows via save_patient.

    The pager's last <li> is either '>' (a next-page arrow, meaning the
    second-to-last item holds the total page count) or the final page number
    itself; both layouts are handled.

    Fixes: a digit-free pager label used to raise ValueError from ``int('')``
    instead of reaching the error branch; the regex is now a raw string
    (was ``"\\D"``); the last pager item is no longer fetched twice.
    """
    last_item_xpath = '//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()]'
    next_link_xpath = '//*[@id="zhuanlilist"]/div[4]/nav/ul/li[last()-1]/a'
    last_label = driver.find_element_by_xpath(last_item_xpath).text
    print(last_label)
    if last_label == '>':
        # Arrow layout: second-to-last item shows the total page count;
        # clicking the arrow (last item) advances one page.
        total_pages = driver.find_element_by_xpath(next_link_xpath).text
        for _ in range(int(total_pages) - 1):
            driver.find_element_by_xpath(last_item_xpath).click()
            time.sleep(1)
            data_infos = get_patent_infomation(driver.page_source)
            save_patient(data_infos, key_company)
    else:
        digits = re.sub(r"\D", "", last_label)
        if digits and int(digits):
            # Numeric layout: last item is the final page number; the
            # second-to-last item is the control that advances a page.
            for _ in range(int(digits) - 1):
                driver.find_element_by_xpath(next_link_xpath).click()
                time.sleep(1)
                data_infos = get_patent_infomation(driver.page_source)
                save_patient(data_infos, key_company)
        else:
            print('error company name is {}'.format(key_company))
if __name__ == '__main__':
    # Each row of the input CSV holds one company name; scrape every company's
    # patent table and append the results to the output CSV.
    csv_file = r"D:\code\patent_info\patient1.csv"
    with open(csv_file, encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        for i, key_company in enumerate(reader):
            print('i is {}'.format(i))
            print('rows is {}'.format(key_company))
            # Collapse the CSV row (a list of fields) into a single name string.
            key_company = ' '.join(key_company)
            patent_url = get_company_message(key_company)
            driver.get(patent_url)
            time.sleep(1)
            # Scrape only when the patent section header is present on the page.
            if iselementExist('//*[@id="zhuanlilist"]/div[1]/h3'):
                data_infos = get_patent_infomation(driver.page_source)
                save_patient(data_infos, key_company)
                # Follow pagination only when a pager bar exists.
                if whether_turn_page('//*[@id="zhuanlilist"]/div[4]/nav/ul'):
                    turn_next_page(key_company)