A brief introduction to Selenium
Anyone who has written crawlers will remember the anti-scraping measures some sites deploy: the target server throttles requests in all sorts of ways, which slows a crawler down dramatically and can even get your account banned. This is where Selenium shines. Selenium is a tool for automated browser testing; it has nothing to do with crawling per se, but its capabilities make it a popular choice for scraping. By driving a real browser, Selenium delivers "whatever you can see, you can scrape": even on client-side rendered pages there is no need to reverse-engineer AJAX requests, you simply read the data through the WebDriver.

Below are two scripts, one that scrapes job postings from 51job and one that scrapes business-registration data from Qichacha (qcc.com). Because these sites change their page structure frequently (precisely to defeat scrapers), the XPath expressions in the code have probably all expired by the time you read this and will need to be updated by hand.
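To make the "whatever you can see, you can scrape" idea concrete before diving into the full scripts, here is a minimal sketch of the pattern both of them follow: start a Chrome WebDriver, load a page, locate an element in the rendered DOM, and read its text. The URL and XPath below are placeholders chosen for illustration, not taken from either site.

from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

web = Chrome()                    # needs a matching chromedriver available on PATH
web.get('https://example.com')    # placeholder URL; get() blocks until the page has loaded
# read text straight from the rendered DOM; the XPath is purely illustrative
heading = web.find_element(By.XPATH, '//h1').text
print(heading)
web.quit()                        # close the browser when finished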
Scraping job postings from 51job
#!/usr/bin/env python3
import random
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
def find_element_with_xpath(obj, path):
    """Return the text of the element located by XPath, or '-' if it cannot be found."""
    try:
        result = obj.find_element(By.XPATH, path).text
    except Exception as e:
        print(e)
        result = '-'
        return result
    else:
        return result


def find_element_with_class(obj, class_name):
    """Return the text of the element located by class name, or '-' if it cannot be found."""
    try:
        result = obj.find_element(By.CLASS_NAME, class_name).text
    except Exception as e:
        print(e)
        result = '-'
        return result
    else:
        return result
def get_web_object():
    """
    Create and return a Chrome browser (WebDriver) object.
    :return:
    """
    web = Chrome()
    web.get('https://www.51job.com')
    web.maximize_window()
    print(web.title)
    print(web.name)
    return web


# True until the first search has been submitted from the home-page search box
initialize = True
def get_company_info(web, search_company_name):
    """
    Search 51job for a company by name and save the results to CSV.
    :param web: the browser (WebDriver) object
    :param search_company_name: the company name to search for
    :return:
    """
    global initialize
    print('#' * 100)
    print(f'当前收集信息的企业是:{search_company_name}')
    random_sleep = random.randint(3, 5)
    time.sleep(random_sleep)
    if initialize:
        # the first search uses the search box on the home page
        kw = web.find_element(By.XPATH, '//*[@id="kwdselectid"]')
        initialize = False
    else:
        # later searches use the search box on the results page
        kw = web.find_element(By.XPATH, '//*[@id="keywordInput"]')
    kw.clear()  # clear the previous input
    kw.send_keys(f'{search_company_name}', Keys.ENTER)  # type the name and press Enter
    def save_job_list(job_list):
        """Save every job in the list as one pipe-delimited line in the CSV file."""
        print('-' * 100)
        print('len(job_list)', len(job_list))
        count = 1
        for job in job_list:
            print('count', count)
            job_name = find_element_with_xpath(job, './a/p[1]/span[1]')
            release_time = find_element_with_xpath(job, './a/p[1]/span[2]')
            salary = find_element_with_class(job, 'sal')
            # first field of the condition string is the location; the remaining fields
            # (experience, headcount, education) may appear in any order
            condition = find_element_with_xpath(job, './a/p[2]/span[2]')
            condition_list = condition.split('|')
            location = condition_list[0].strip()
            experience, need_people, education = '-', '-', '-'
            for c in condition_list[1:]:
                if '年' in c or '经验' in c:
                    experience = c
                    continue
                if '招' in c or '人' in c:
                    need_people = c
                else:
                    education = c
            welfare = find_element_with_xpath(job, './a/p[3]').replace('\n', ',')
            company_full_name = find_element_with_xpath(job, './div[2]/a')
            company_info = find_element_with_xpath(job, './div[2]/p[1]').split('|')
            stuff_nums, company_type = '-', '-'
            for c in company_info:
                if '人' in c:
                    stuff_nums = c
                    continue
                else:
                    company_type = c
            industry = find_element_with_xpath(job, './div[2]/p[2]')
            count += 1
            info_list = [search_company_name, release_time, job_name, location, salary, experience, education,
                         need_people, welfare, company_full_name, stuff_nums, company_type, industry]
            info_string = '|'.join(info_list)
            info_string += '\n'
            print(f'info_string:\n{info_string}\n')
            save_file_name = './company_information.csv'
            with open(save_file_name, 'a+', encoding='utf-8') as f_obj:
                f_obj.write(info_string)
    # wait for the results to load, then locate the job list
    time.sleep(3)
    # job list on the first page
    job_lists = web.find_elements(By.XPATH, '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[1]/div')
    if len(job_lists) == 0:
        return
    # first page
    save_job_list(job_lists)
    # check how many pages there are
    ul = web.find_element(By.XPATH, '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[2]/div/div/div/ul')
    lis = ul.find_elements(By.TAG_NAME, 'li')
    if len(lis) <= 3:
        return
    # next page
    next_button = web.find_element(By.CLASS_NAME, 'next')
    cur_page_num = int(ul.find_element(By.CLASS_NAME, 'on').text) if ul.find_element(By.CLASS_NAME, 'on') else 999
    print('cur_page_num', cur_page_num)
    while cur_page_num < len(lis) - 2:
        try:
            next_button.click()
            time.sleep(3)
            job_lists = web.find_elements(By.XPATH, '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[1]/div')
            # re-check the pagination after the page has changed
            ul = web.find_element(By.XPATH, '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[2]/div/div/div/ul')
            lis = ul.find_elements(By.TAG_NAME, 'li')
            # re-locate the "next" button, since the old reference can go stale after navigation
            next_button = web.find_element(By.CLASS_NAME, 'next')
            cur_page_num = int(ul.find_element(By.CLASS_NAME, 'on').text) if ul.find_element(By.CLASS_NAME, 'on') else 999
            print('cur_page_num', cur_page_num)
        except Exception as e:
            print(e)
            return
        else:
            save_job_list(job_lists)
def company_info_collect(company_list):
    """
    Collect job information for every company name in the list.
    :param company_list: list of company names
    :return:
    """
    web = get_web_object()
    for company_name in company_list:
        get_company_info(web, company_name)


if __name__ == '__main__':
    company_name_list = []  # the company names to scrape
    company_info_collect(company_name_list)
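To run the script, populate company_name_list in the __main__ block with the companies you want to look up (the names below are hypothetical examples). Every job found is appended as one pipe-delimited line to ./company_information.csv.

if __name__ == '__main__':
    company_name_list = ['某某科技有限公司', '某某网络股份有限公司']  # hypothetical examples
    company_info_collect(company_name_list)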
Scraping business-registration information from qcc.com
#!/usr/bin/env python3
import random
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
def find_element_with_xpath(obj, path):
    """Return the text of the element located by XPath, or '-' if it cannot be found."""
    try:
        result = obj.find_element(By.XPATH, path).text
    except Exception as e:
        print(e)
        result = '-'
        return result
    else:
        return result


def find_element_with_class(obj, class_name):
    """Return the text of the element located by class name, or '-' if it cannot be found."""
    try:
        result = obj.find_element(By.CLASS_NAME, class_name).text
    except Exception as e:
        print(e)
        result = '-'
        return result
    else:
        return result
def get_web_object():
    """
    Create and return a Chrome browser (WebDriver) object.
    :return:
    """
    web = Chrome()
    web.get('https://www.qcc.com/')
    web.maximize_window()
    web.refresh()
    print(web.title)
    print(web.name)
    return web


# True until the first search has been submitted
initialize = True
def get_company_business_info(web, search_company_name):
    """
    Search qcc.com for a company by name and save the result to CSV.
    :param web: the browser (WebDriver) object
    :param search_company_name: the company name to search for
    :return:
    """
    global initialize
    print('#' * 100)
    print(f'当前收集信息的企业是:{search_company_name}')
    random_sleep = random.randint(3, 5)
    time.sleep(random_sleep)
    if initialize:
        kw = web.find_element(By.XPATH, '//*[@id="searchKey"]')
        initialize = False
    else:
        kw = web.find_element(By.XPATH, '//*[@id="searchKey"]')
    kw.clear()  # clear the previous input
    kw.send_keys(f'{search_company_name}', Keys.ENTER)  # type the name and press Enter
    # wait for the results to load, then locate the data
    time.sleep(3)
    # get the first search result; the result table shows up under one of two container paths
    first_result = None
    try:
        first_result = web.find_element(By.XPATH, '/html/body/div[1]/div[2]/div[2]/div[3]/div/div[2]/div/table/tr[1]')
    except Exception as e:
        print(e)
        try:
            first_result = web.find_element(
                By.XPATH, '/html/body/div[1]/div[2]/div[2]/div[4]/div/div[2]/div/table/tr[1]')
        except Exception as e:
            print(e)
    if not first_result:
        # no search result: keep the company name and fill the remaining columns with '-'
        info_list = [search_company_name, '-', '-', '-', '-', '-', '-', '-']
    else:
        company_full_name = find_element_with_xpath(first_result, './td[3]/div/div[1]/span[1]/a')
        registered_capital = find_element_with_xpath(first_result, './td[3]/div/div[4]/div[1]/span[2]/span')
        legal_person = find_element_with_xpath(first_result, './td[3]/div/div[4]/div[1]/span[1]/span/span/a')
        establish_day = find_element_with_xpath(first_result, './td[3]/div/div[4]/div[1]/span[3]/span')
        credit_code = find_element_with_xpath(first_result, './td[3]/div/div[4]/div[1]/span[4]/span/div/span[1]')
        location = find_element_with_xpath(first_result, './td[3]/div/div[4]/div[3]/span/div/span[1]')
        official_website = find_element_with_xpath(first_result, './td[3]/div/div[4]/div[2]/span[3]/span/a')
        info_list = [search_company_name, company_full_name, registered_capital, legal_person, establish_day,
                     credit_code, location, official_website]
    info_string = '|'.join(info_list)
    info_string += '\n'
    print(f'info_string:\n{info_string}\n')
    save_file_name = './company_business_info.csv'
    with open(save_file_name, 'a+', encoding='utf-8') as f_obj:
        f_obj.write(info_string)
def company_info_collect(company_list):
    """
    Collect business-registration information for every company name in the list.
    :param company_list: list of company names
    :return:
    """
    web = get_web_object()
    for company_name in company_list:
        get_company_business_info(web, company_name)


if __name__ == '__main__':
    company_name_list = []  # the company names to scrape
    company_info_collect(company_name_list)
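The entry point works the same way as in the first script: fill company_name_list and run. One pipe-delimited line per company is appended to ./company_business_info.csv, and when no search result is found the remaining columns are written as '-' placeholders, so a line for a (hypothetical) company with no match looks like:

某某科技有限公司|-|-|-|-|-|-|-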