Complete 51job scraping code:
import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os
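# Overall flow of this script: selenium (Firefox) loads the 51job search-list pages,
# jobMesssage/jobFirm parse each job/company entry, jobRequire/firmMeessage open the
# detail pages for requirements and company profiles, results are appended to job.csv
# and jobGain.json, and finally xlwings writes them to Excel (writeExcel builds
# jobGain.xlsx in one pass; write_only fills columns of an existing 职业发展-only.xlsx).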
# 获取职位信息
def jobMesssage(item):
df = pd.DataFrame()
item.list = item.find_all('a', attrs={'class': 'el'}) # 获取招聘岗位信息
for i, item in enumerate(item.list):
try:
df['招聘职位网址'] = item.get('href'),
df['岗位名称'] = item.find_all('span')[0].text,
df['发布日期'] = item.find_all('span')[1].text,
df['薪资'] = item.find_all('span')[2].text, #
df['工作地及要求'] = item.find_all('span')[3].text, #
# df_all=pd.concat([df,df_all],axis=1)
item.list = item.find_all('p', attrs={'class': 'tags'})
for i, item.list in enumerate(item.list):
df['福利'] = item.list.get('title'), #
print(str(i), '招聘职位写入正常')
except:
            print(str(i), '招聘职位写入异常')
return df
# 获取职位对应公司信息
def jobFirm(item):
df = pd.DataFrame()
item.list = item.find_all('div', attrs={'class': 'er'}) # 获取招聘公司信息
for i, item in enumerate(item.list):
# print(item,i,sep=',')
# print(item.find_all('p')[1].text)
try:
df['招聘公司网址'] = item.find('a').get('href'),
df['公司名称'] = item.find('a').text,
df['公司规模'] = item.find_all('p')[0].text,
            df['所属行业'] = item.find_all('p')[1].text,
print(str(i), '招聘公司写入正常')
except:
print(str(i), '招聘公司写入异常')
return df
# 职位要求
def jobRequire(html):
df = pd.DataFrame()
# with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'tHeader tHjob'})
html.list = html.find_all('div', attrs={'class': 'tCompany_main'})
# print(html.list)
for i, item in enumerate(html.list):
try:
# contactInf=item.find_all('div', attrs={'class': 'tBorderTop_box'})[1].find('span').text.strip('') #联系方式
# officeAddress=item.find_all('div', attrs={'class': 'tBorderTop_box'})[1].find('p').text#上班地址
jobRequir_a = item.find('div', attrs={'class': 'tBorderTop_box'}).text.strip('').replace('\n', '').replace(
'\t', '') \
.replace(' ', '') # 任职要求
# print(jobRequir_a, i, sep='')
item.list = item.find('div', attrs={'class': 'tBorderTop_box'}).find_all('p')
jobRequir = [] # 职位要求
for i, item in enumerate(item.list):
jobRequir.append(item.text.strip('') + '\n')
jobRequirText = ''.join(jobRequir)
# print(jobRequirText)
# print(jobRequirText.find('任职要求'))
if jobRequirText.find('任职要求') > 0:
df['招聘要求'] = jobRequirText,
else:
df['招聘要求'] = jobRequir_a,
# print(df)
print(str(i), '职位信息写入正常')
except:
print(str(i), '职位信息写入异常')
return df
# 招聘公司信息获取
def firmMeessage(html):
df = pd.DataFrame()
# with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
html.list = html.find_all('div', attrs={'class': 'tCompany_full'})
# print(html.list)
for i, item in enumerate(html.list):
item.list = item.find_all('div', attrs={'class': 'tBorderTop_box'})
# print(item.list[0].text.strip('').replace('\n', '').replace('\t', '').replace(' ', ''))
# for i, item in enumerate(item.list):
# print(item.text,i,sep='')
try:
df['公司信息'] = item.list[0].text.strip('').replace('\n', '').replace('\t', '').replace(' ', ''),
# print(df)
print(str(i), '公司信息写入正常')
except:
print(str(i), '公司信息写入异常')
return df
class writeExcel:
def __init__(self, data):
self.data = data
# print(data)
def wE_r(self):
app = xw.App(visible=False, add_book=False)
new_workbook = xw.Book()
new_worksheet = new_workbook.sheets.add('worksheet')
app.display_alerts = False
app.screen_updating = False
new_worksheet.range('l:l').row_height = 20
new_worksheet.range('l:l').column_width = 11
title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
"招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期', '是否投递']
new_worksheet['A1'].value = title
for i in range(len(self.data)):
new_worksheet.cells[i + 1, 0].value = i + 1
new_worksheet.cells[i + 1, 1].value = data[i]['岗位名称']
new_worksheet.cells[i + 1, 2].value = data[i]['发布日期']
new_worksheet.cells[i + 1, 3].value = data[i]['薪资']
new_worksheet.cells[i + 1, 4].value = data[i]['工作地及要求']
new_worksheet.cells[i + 1, 5].value = data[i]['公司名称']
new_worksheet.cells[i + 1, 6].value = data[i]['公司规模']
new_worksheet.cells[i + 1, 7].value = data[i]['所属行业']
new_worksheet.cells[i + 1, 8].value = data[i]['招聘职位网址']
# new_worksheet.cells[i + 1, 9].value = data[i]['招聘要求']
new_worksheet.cells[i + 1, 10].value = data[i]['招聘公司网址']
# new_worksheet.cells[i + 1, 11].value = data[i]['公司信息']
new_worksheet.cells[i + 1, 12].value = data[i]['福利']
# 修改项目
new_worksheet.cells[i + 1, 13].value = key # 关键字
new_worksheet.cells[i + 1, 14].value = '15-40K' if salary == '08%252c09%252c10' else '20-30K' # 薪资范围
            new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # 记录日期
print(str(i), 'Excel数据写入正常')
new_worksheet.autofit()
new_workbook.save('jobGain.xlsx')
new_workbook.close()
app.quit()
    def run(self):
        # 同步执行写入
        self.wE_r()
# 单独写入excel
def write_only():
app = xw.App(visible=True, add_book=False)
wb = app.books.open('职业发展-only.xlsx')
# 创建一个worksheet
sh = wb.sheets['前程无忧']
rng_firmMeessage = [i for i in sh.range("k:k").value if i != None] # 单元格内容
rng_jobRequire = [i for i in sh.range("i:i").value if i != None] # 单元格内容
j = sh.range('a1').expand('table').rows.count # 序号
app.display_alerts = False
# app.screen_updating = False
myWeb = Web(job_url) # 实例化类
for i in range(len(rng_jobRequire) - 1):
try:
html = myWeb.web_a(rng_jobRequire[i + 1]) # 获取招聘要求信息
print(rng_jobRequire[i + 1])
df4 = jobRequire(html)
print(df4)
# print(df4.index)
# print(df4.iloc[0,0])
sh.cells[i + 1, 9].value = df4.iloc[0, 0]
print(str(i), '招聘要求写入正常')
html = myWeb.web_b(rng_firmMeessage[i + 1]) # 获取公司信息
print(rng_firmMeessage[i + 1])
df5 = firmMeessage(html)
print(df5)
sh.cells[i + 1, 11].value = df5.iloc[0, 0]
print(str(i), '公司信息写入正常')
except:
print(str(i), "数据查询错误")
# sh.autofit()
wb.save('职业发展-only.xlsx')
wb.close()
app.quit()
class Web:
def __init__(self, url):
self.url = url
def web(self):
# with open('jobhtml.html', 'r', encoding='utf-8') as f:
# job_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?'
driver.back()
time.sleep(0.3)
driver.get(self.url) # 加载网址
time.sleep(1 + 1)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'j_joblist'})
return html.list
# 招聘需求信息获取
def web_a(self, url):
job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
driver.back()
driver.get(job_url) # 加载网址
time.sleep(0.3 + 0.4)
driver.get(url) # 加载网址
time.sleep(1.2 + 1)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html
# 招聘公司信息获取
def web_b(self, url):
job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
driver.back()
driver.get(job_url) # 加载网址
time.sleep(0.5 + 0.5)
driver.get(url) # 加载网址
time.sleep(1.2 + 1)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
# print(html)
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html
key = '配送' # 物流经理#物流运营#物流管理【#运营#物流#数据#运输#仓储#配送】
salary = '08%252c09%252c10' # 08表示1.5-20K,09表示20-30k,08%252c09%252c10表示1.5-20K,20-30K,30K以上
timeday = '3' # 1表示近三天,2表示近一周,近一个月是3
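# A minimal sketch (for illustration only) of how the search-list URL is assembled from
# the parameters above; it mirrors the URL pattern used in the disabled crawl loop below.
# City code 080200 (Hangzhou) and the query-string defaults are assumptions taken from
# that loop, not a verified 51job API contract.
def build_51job_list_url(key, salary, timeday, page):
    base = 'https://search.51job.com/list/080200,000000,0000,21,'
    tail = ('.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99'
            '&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare=')
    return base + timeday + ',' + salary + ',' + key + ',2,' + str(page) + tail
# e.g. build_51job_list_url(key, salary, timeday, 1)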
if __name__ == "__main__":
opt = FirefoxOptions() # ChromeOptions() # 创建chrome参数
# 不加载图片
opt.set_preference('permissions.default.image', 2)
opt.headless = False # 显示浏览器
driver = webdriver.Firefox(options=opt) # Chrome(options=opt) # 浏览器实例化
# driver.set_window_size(500, 900)
# options = FirefoxOptions()
# selenium = webdriver.Firefox(options=options)
job_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?'
    # 杭州,2-3万'https://search.51job.com/list/080200,000000,0000,00,9,09,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # 杭州1.5-2'https://search.51job.com/list/080200,000000,0000,00,9,08,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
# # 招聘需求信息获取
# myWeb = Web(job_url) # 实例化类
# time.sleep(0.2)
# html = myWeb.web_a('https://jobs.51job.com/hangzhou-scq/125683481.html?s=sou_sou_soulb&t=0_0') # 'https://jobs.51job.com/hangzhou/135496109.html?s=sou_sou_soulb&t=0_0') # 实例化网址
# # df4 = jobRequire(html) # 获取职位需求信息
# df4 = jobRequire()
# print(df4)
# time.sleep(0.3)
'''
# 取前三页数据
df = pd.DataFrame() # 定义pands整理表格
for i in range(6):
try: # '+str(i+1)+'#08表示1.5-20K,09表示20-30k
print(str(i), '获取第{}页数据'.format(i + 1))
            job_url = 'https://search.51job.com/list/080200,000000,0000,21,' + timeday + ',' + salary + ',' + key + ',2,' + str(
                i + 1) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare='  # 03%252c04表示大专和本科学历
print(job_url)
            'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加学历degreefrom
            'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加薪资
            'https://search.51job.com/list/080200,000000,0000,21,3,09,%25E8%25BF%2590%25E8%2590%25A5,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
            # 'https://search.51job.com/list/080200,000000,0000,00,1,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
            # 'https://search.51job.com/list/080200,000000,0000,00,3,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
# with open('jobhtml.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'j_joblist'})
time1 = time.time() # 计算时长
myWeb = Web(
job_url) # 实例化类 # 'https://jobs.51job.com/hangzhou-yhq/135494019.html?s=sou_sou_soulb&t=0_0') # 实例化网址
time.sleep(1 + 1)
html = myWeb.web()
# print(html)
for i, item in enumerate(html):
# print(item,i,sep=',')
item.list = item.find_all('div', attrs={'class': 'e'}) # 获取每个招聘岗位条目
for i, item in enumerate(item.list):
df1 = jobMesssage(item) # 获取岗位
# print(df1['招聘职位网址'])
df2 = jobFirm(item) # 获取公司
url = str(df1['招聘职位网址'].values).strip("['").strip("']").strip('')
print(url)
url_b = str(df2['招聘公司网址'].values).strip("['").strip("']").strip('')
print(url_b)
                    # 逐条打开职位详情页与公司页抓取补充信息
# 招聘需求信息获取
myWeb = Web(job_url) # 实例化类
time.sleep(0.3)
html = myWeb.web_a(
url) # 'https://jobs.51job.com/hangzhou/135496109.html?s=sou_sou_soulb&t=0_0') # 实例化网址
df4 = jobRequire(html) # 获取职位需求信息
print(df4)
time.sleep(0.5 + 0.5 + 0.5)
# 招聘公司信息获取
myWeb = Web(job_url) # 实例化类
time.sleep(0.3)
html = myWeb.web_b(url_b) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
df5 = firmMeessage(html) # 获取职位需求信息
print(df5)
time.sleep(0.5 + 0.5 + 0.5)
df3 = pd.concat([df1, df2], axis=1)
df6 = pd.concat([df3, df4], axis=1)
df7 = pd.concat([df5, df6], axis=1)
df7.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
df = pd.concat([df, df7], axis=0)
                    # 详情抓取结束
df3 = pd.concat([df1, df2], axis=1)
df = pd.concat([df, df3], axis=0)
print(df)
df.to_json('jobGain.json', orient='records', indent=1, force_ascii=False)
time.sleep(0.5 + 0.5 + 0.5)
time.sleep(0.5 + 0.5 + 0.5)
print(str(i), '数据正常'.format(i + 1))
time2 = time.time() # 计算时长
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), '数据异常'.format(i + 1))
'''
'''
# key = '物流管理' # 物流经理#物流运营
# salary = '08' # 08表示1.5-20K,09表示20-30k
with open('jobGain.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# print(data)
myWe = writeExcel(data) # 写入excel
myWe.run() # 执行多线程
'''
    # 单独写入excel
write_only()
try: # 关闭后台浏览器
driver.close()
driver.quit()
os.system('taskkill /F /IM chromedriver.exe') # 关闭进程浏览器
sreach_windows = driver.current_window_handle
# 获得当前所有打开的窗口的句柄
all_handles = driver.window_handles
for handle in all_handles:
driver.switch_to.window(handle)
driver.close()
time.sleep(1.2)
except:
        print('后台浏览器已关闭完毕')
Scraping data from Liepin (猎聘网):
import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os
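# Overall flow of this script: selenium (Chrome) loads the Liepin search-list pages,
# jobMesssage parses each job card into job.csv / jobliepin.json, jobRequire fetches the
# detail page with requests (cookie-based headers) to extract requirements and company
# info, and writeExcel writes everything to jobliepin.xlsx via xlwings.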
# 获取招聘职位信息
def jobMesssage(html):
df_jobMesssage = pd.DataFrame()
df = pd.DataFrame()
# with open('jobhtml.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'left-list-box'})
for i, item in enumerate(html):
item.list = item.find_all('div', attrs={'class': 'job-detail-box'})
for i, item in enumerate(item.list):
# print(item, i, sep=',')
# print(item.find('div', attrs={'class': 'job-detail-header-box'}).find('span', attrs={'class': 'job-salary'}).text,i,sep=',')
try:
df_jobMesssage['招聘职位网址'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).get('href'),
df_jobMesssage['岗位名称'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).find('div', attrs={
'class': 'job-title-box'}).text.strip('').replace('\n', '').replace('\t', ''),
df_jobMesssage['工作地及要求'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).find('div',
attrs={
'class': 'job-labels-box'}).text.strip(
'').replace('\n', '').replace('\t', ''), #
df_jobMesssage['公司名称'] = item.find('div', attrs={'data-nick': 'job-detail-company-info'}).find('div',
attrs={
'class': 'job-company-info-box'}).text.strip(
'').replace('\n', '').replace('\t', '')
df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-detail-header-box'}).find('span', attrs={
'class': 'job-salary'}).text
# print(df_jobMesssage)
df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
df = pd.concat([df, df_jobMesssage], axis=0)
# df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)
print(str(i), '招聘职位写入正常')
except:
                print(str(i), '招聘职位写入异常')
return df
# 获取招聘要求和公司信息
def jobRequire(url):
df = {} # 定义字典
# url='https://www.liepin.com/a/29686195.shtml?d_sfrom=search_prime&d_ckId=c8f01cee484fdfafc8e1e5d047a1e1d1&d_curPage=0&d_pageSize=40&d_headId=6ae8e76ae415c8d307347eef4182b4e4&d_posi=38'
    cookie = '__uuid=1632571874000.95; __s_bid=11011704223d5f9c92ff5bd3e81bc8334a74; __tlog=1632611231431.79%7C00000000%7C00000000%7C00000000%7C00000000; Hm_lvt_a2647413544f5a04f00da7eee0d5e200=1632571900,1632611231; Hm_lpvt_a2647413544f5a04f00da7eee0d5e200=1632615070; __session_seq=12; __uv_seq=12'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'Cookie': cookie,
        'Connection': 'keep-alive'
    }
# 新闻链接
# session = requests.session()
res = requests.get(url=url, headers=headers, timeout=30)
res.encoding = 'utf-8'
res.raise_for_status()
res.encoding = res.apparent_encoding
html = BeautifulSoup(res.text, 'html.parser')
time.sleep(0.1)
# print(html)
# 存入本地
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(res.text)
# with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
html.list = html.find_all('content') # 整体框架
for i, item in enumerate(html):
# item.list = item.find_all('section', attrs={'class': 'company-intro-container'})[0].text#上级框架
# print(item.list)
try:
df['招聘要求'] = item.find_all('section', attrs={'class': 'job-intro-container'})[0].text.strip('\n'),
df['公司信息'] = item.find_all('section', attrs={'class': 'company-intro-container'})[0].text.strip('\n'),
# df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
# df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)
print(df)
print(str(i), '招聘职位写入正常')
except:
            print(str(i), '招聘职位写入异常')
return df
class Web:
def __init__(self, url):
self.url = url
# 获取招聘职位信息
def web(self):
driver.back()
time.sleep(0.3)
driver.get(self.url) # 加载网址
time.sleep(1)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'left-list-box'})
# with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html.list
# 获取招聘要求和公司信息
def web_a(self, url):
driver.back()
time.sleep(0.3)
driver.get(url) # 加载网址
time.sleep(1)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('content') # 整体框架
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html.list
class writeExcel:
def __init__(self, data):
self.data = data
# print(data)
def wE_r(self):
app = xw.App(visible=False, add_book=False)
new_workbook = xw.Book()
new_worksheet = new_workbook.sheets.add('worksheet')
app.display_alerts = False
app.screen_updating = False
title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
"招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
new_worksheet['A1'].value = title
for i in range(len(self.data)):
try:
df_w = jobRequire(data[i]['招聘职位网址'])
print(data[i]['招聘职位网址'])
# if i%9==8:
# time.sleep(20)#每取8个停下8秒应对反扒
# else:
# time.sleep(0.2)
new_worksheet.cells[i + 1, 0].value = i + 1
new_worksheet.cells[i + 1, 1].value = data[i]['岗位名称']
new_worksheet.cells[i + 1, 2].value = '' # data[i]['发布日期']
new_worksheet.cells[i + 1, 3].value = data[i]['薪资']
new_worksheet.cells[i + 1, 4].value = data[i]['工作地及要求']
new_worksheet.cells[i + 1, 5].value = data[i]['公司名称']
new_worksheet.cells[i + 1, 6].value = '' # data[i]['公司规模']
new_worksheet.cells[i + 1, 7].value = '' # data[i]['所属行业']
new_worksheet.cells[i + 1, 8].value = data[i]['招聘职位网址']
new_worksheet.cells[i + 1, 9].value = df_w[
'招聘要求'] # str(df_w['招聘要求'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 10].value = '' # data[i]['招聘公司网址']
new_worksheet.cells[i + 1, 11].value = df_w[
'公司信息'] # str(df_w['公司信息'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 12].value = '' # data[i]['福利']
# 修改项目
new_worksheet.cells[i + 1, 13].value = key # 关键字
new_worksheet.cells[i + 1, 14].value = salary # 薪资范围
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # 记录日期
print(str(i), 'Excel数据写入正常')
except:
print(str(i), 'Excel数据写入异常')
# new_worksheet.autofit()
new_workbook.save('jobliepin.xlsx')
new_workbook.close()
app.quit()
    def run(self):
        # 同步执行写入
        self.wE_r()
df=pd.DataFrame()#定义 全局变量
key='配送'#物流经理#物流运营【#运营#物流#数据#运输#仓储#配送】
salary='15$30'#20$40#10$20【15$30】
timeday='7'#7表示一周内,3表示3天内,30表示近一个月
if __name__ == "__main__":
# jobRequire()
opt = ChromeOptions() # 创建chrome参数
opt.headless = False # 显示浏览器
driver = Chrome(options=opt) # 浏览器实例化
# driver=webdriver.Chrome()
# driver.set_window_size(300, 700)
for i in range(6): # +str(i);key=
try:
print(str(i), '获取第{}页数据'.format(i + 1))
            job_url = 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key=' + str(
                key) + '&industry=9$250&dq=070020&salary=' + salary + '&pubTime=' + timeday + '&currentPage=' + str(i)
print(job_url)
            'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&ckId=l07j4hoqgyh0gdr1cskm5ur0c8umz86a&oldCkId=84c318d34f244edd65090ab5353419c3&fkId=myu97a638ugqeosooma1w35gexn1tw78&skId=44ef33b0864b17ba4a80114662f6d01a&sfrom=search_job_pc&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&industry=9$250&dq=070020&salary=15$30&pubTime=7&customSalary=1&currentPage=1&scene=page'
'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&ckId=6aa2zbc9ptmwb1w7909zc2vm047p6uib&fkId=myu97a638ugqeosooma1w35gexn1tw78&skId=44ef33b0864b17ba4a80114662f6d01a&sfrom=search_job_pc&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&industry=&dq=070020&salary=15$30&pubTime=7&customSalary=1&scene=condition'
            'https://www.liepin.com/zhaopin/?headId=12baac27653545ffceb6a268fc0c82aa&ckId=12baac27653545ffceb6a268fc0c82aa&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&dq=070020&salary=20$40&pubTime=3&currentPage=1'
'https://www.liepin.com/zhaopin/?headId=12baac27653545ffceb6a268fc0c82aa&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&dq=070020&salary=10$20&pubTime=3'
'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key=%E7%89%A9%E6%B5%81%E7%AE%A1%E7%90%86&dq=070020&salary=20$40&pubTime=3'
# job_url_a='https://www.liepin.com/a/30216633.shtml?d_sfrom=search_prime&d_ckId=10e193c94fdc8095c14815c02246e6e7&d_curPage=0&d_pageSize=40&d_headId=6ae8e76ae415c8d307347eef4182b4e4&d_posi=2'
time1 = time.time() # 计算时长
# 获取招聘职位信息
myWeb = Web(job_url) # 实例化类
html = myWeb.web() # 招聘要求和公司信息
time.sleep(0.5)
# print(html)
df1 = jobMesssage(html)
df = pd.concat([df1, df], axis=0)
df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)
time2 = time.time() # 计算时长
print(str(i), '数据正常'.format(i + 1))
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), '数据异常'.format(i + 1))
# 写入excel
with open('jobliepin.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# print(data)
myWe = writeExcel(data) # 写入excel
    myWe.run()  # 执行写入
try: # 关闭后台浏览器
driver.close()
driver.quit()
os.system('taskkill /F /IM chromedriver.exe') # 关闭进程浏览器
sreach_windows = driver.current_window_handle
# 获得当前所有打开的窗口的句柄
all_handles = driver.window_handles
for handle in all_handles:
driver.switch_to.window(handle)
driver.close()
time.sleep(1.2)
except:
        print('后台浏览器已关闭完毕')
BOSS直聘 (Boss Zhipin) scraping code:
import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os
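# Overall flow of this script: selenium (Chrome) loads the zhipin.com search-list pages,
# jobMesssage parses the job-primary cards into job.csv / jobBoss.json, Web.web_a opens
# each job-detail page (pausing every 8 requests to reduce anti-crawling blocks),
# jobRequire extracts requirements and company info, and write_only fills the matching
# columns of the existing 职业发展-only.xlsx workbook (sheet 'Boss'); the batch
# writeExcel path to jobBoss.xlsx is kept in a disabled block below.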
def jobMesssage(html):
df_jobMesssage = pd.DataFrame()
df = pd.DataFrame()
# with open('jobhtml.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'job-list'})
# print(html.list)
for i, item in enumerate(html):
item.list = item.find_all('div', attrs={'class': 'job-primary'})
# print(item,i,sep=',')
for i, item in enumerate(item.list): # 获取每个招聘条目
# print(item, i, sep=',')
try:
item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
'\n', ' ')
print(item.list, i, sep=',')
df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
attrs={'class': 'primary-box'}).get(
'href'),
df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
'class': 'job-name'}).text,
df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
'class': 'job-area-wrapper'}).text.strip('\n'), #
df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
'').replace(
'\n', ' '),
df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
'\n', ' '),
df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
'').replace(
'\n', ' '),
# print(df_jobMesssage)
df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
df = pd.concat([df, df_jobMesssage], axis=0)
df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
print(str(i), '公司信息写入正常')
except:
print(str(i), '公司信息写入异常')
return df
def jobRequire(html):
# df = pd.DataFrame()
df = {} # 定义字典
# # url='https://www.zhipin.com/job_detail/c3aea253a5b3b2501nJ92d-9GFBR.html'
# url='https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html'
# # url='https://www.zhipin.com/job_detail/1635c904e28317c31nN63ti0FlJY.html'
# cookie = 'Cookie: __guid=95203226.4063907470298592000.1630401055947.081; _bl_uid=tIkzmsaaz8bup1qepsempvm87k3z; wt2=Dt6B1sNjfS9mOw2rOUcWz7LnE65oG5AcG7C-7iuSGQ10DZgwjtuGdrBZlKOJt5QsEu8DWRIOSeNQ2a7qP7q1yRQ~~; lastCity=101210100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1630888771,1632789052,1632907583,1632959098; acw_tc=0bdd34ba16329610479403976e01a46b6a653805d48cc356c7a1254d2d5375; __c=1632959098; __a=66278464.1630401067.1632907554.1632959098.52.6.7.47; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1632962530; __zp_stoken__=0138dGiMjNjETJHpLDRQ2VDBYbnMRPGxPGRFeJC8TJ0Y%2FASEDIHMxYwBwZi8AHjN%2BTxwJVQgkUkJCHRMVQ3ACZm0YMWV2U1EgOHM5WnAVdzxse017agxTPj5JZUd4Q1w1DSU7fXVbUEcKIRY%3D; __zp_sseed__=iVynj0LLIRVDsqGYxrY8A2rJBiqFMuzEYl1KvBTzD1Q=; __zp_sname__=e948d594; __zp_sts__=1632962688132; monitor_count=40'
# # cookie ='Cookie: HMACCOUNT_BFESS=399A131593FFAEE5; BDUSS_BFESS=VpjS3U5Q1hQd3ktdkMwand3N3k1ekppN1FJSUhSc2EtdVBEMGhBaU0zSEdYbEpoRVFBQUFBJCQAAAAAAAAAAAEAAADB320FNTMzNTg5NDkzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMbRKmHG0SphW; BAIDUID_BFESS=DA74B922ACBBFCBDF71367A36C973898:FG=1'
# # cookie ='set-cookie: __zp_sseed__=iVynj0LLIRVDsqGYxrY8A7QRlGL1xd7z8VDrvc0yURg=; Path=/; Domain=.zhipin.com'
# headers = {
# 'user-agent': 'user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
# 'Cookie': cookie,
# 'Connection': 'keep - alive',
# 'Accept':'Accept: image / avif, image / webp, image / apng, image / *, * / *;q = 0.8',
# }
# # 新闻链接
# # session = requests.session()
# res = requests.get(url=url, headers=headers, timeout=30)
# res.encoding = 'utf-8'
# res.raise_for_status()
# res.encoding = res.apparent_encoding
# html = BeautifulSoup(res.text, 'html.parser')
# time.sleep(3)
# print(html)
# # 存入本地
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(res.text)
# with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'job-detail'}) # 整体框架
for i, item in enumerate(html):
# print(item,1,sep=',')
item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip('').replace(' ', '')
print(item.list, i, sep=',')
try:
df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
'\n', ' ').replace('\r', ' ').replace('\t', ' '), # 上级框架,
df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
''),
# df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
# df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# print(df)
print(str(i), '招聘职位写入正常')
except:
print(str(i), '招聘职位写入异常')
return df
class writeExcel:
def __init__(self, data):
self.data = data
# print(data)
def wE_r(self):
app = xw.App(visible=False, add_book=False)
new_workbook = xw.Book()
new_worksheet = new_workbook.sheets.add('worksheet')
app.display_alerts = False
app.screen_updating = False
title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
"招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期','是否投递']
new_worksheet['A1'].value = title
new_worksheet.range('l:l').row_height=20
new_worksheet.range('l:l').column_width=11
for i in range(len(self.data)):
try:
# df_w = jobRequire(data[i]['招聘职位网址'])
# print(data[i]['招聘职位网址'])
new_worksheet.cells[i + 1, 0].value = i + 1
new_worksheet.cells[i + 1, 1].value = data[i]['岗位名称']
new_worksheet.cells[i + 1, 2].value = '' # data[i]['发布日期']
new_worksheet.cells[i + 1, 3].value = data[i]['薪资']
new_worksheet.cells[i + 1, 4].value = data[i]['工作地及要求']
new_worksheet.cells[i + 1, 5].value = data[i]['公司名称']
new_worksheet.cells[i + 1, 6].value = '' # data[i]['公司规模']
new_worksheet.cells[i + 1, 7].value = '' # data[i]['所属行业']
new_worksheet.cells[i + 1, 8].value = data[i]['招聘职位网址']
# new_worksheet.cells[i + 1, 9].value =df_w['招聘要求']#str(df_w['招聘要求'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 10].value = '' # data[i]['招聘公司网址']
# new_worksheet.cells[i + 1, 11].value = df_w['公司信息']#str(df_w['公司信息'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 12].value = '' # data[i]['福利']
# 修改项目
new_worksheet.cells[i + 1, 13].value = key # 关键字
new_worksheet.cells[i + 1, 14].value = '20-30k' if salary == '6' else '15-20K' # 薪资范围
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # 记录日期
print(str(i), 'Excel数据写入正常')
except:
print(str(i), 'Excel数据写入异常')
# 招聘公司信息获取
for i in range(len(self.data)):
try:
# 招聘公司信息获取
time1 = time.time() # 计算时长
myWeb = Web(url) # 实例化类
time.sleep(0.5+1)
html = myWeb.web_a(data[i]['招聘职位网址'],i) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
df_w = jobRequire(html) # 获取职位需求信息
print(df_w)
time.sleep(2)
new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
print(str(i), 'Excel数据-2模块写入正常')
time2 = time.time() # 计算时长
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), 'Excel数据-2模块写入异常')
# new_worksheet.autofit()
new_workbook.save('jobBoss.xlsx')
new_workbook.close()
app.quit()
    def run(self):
        # 同步执行写入
        self.wE_r()
# 单独写入excel
def write_only():
app = xw.App(visible=True, add_book=False)
wb = app.books.open('职业发展-only.xlsx')
# 创建一个worksheet
sh = wb.sheets['Boss']
rng_jobRequire = [i for i in sh.range("i:i").value if i != None] # 单元格内容
j = sh.range('a1').expand('table').rows.count # 序号
app.display_alerts = False
# app.screen_updating = False
myWeb = Web(url) # 实例化类
for i in range(len(rng_jobRequire) - 1):
try:
html = myWeb.web_a(rng_jobRequire[i + 1],i) # 获取招聘要求信息
print(rng_jobRequire[i + 1])
df_w = jobRequire(html)
print(df_w)
# print(df4.index)
# print(df4.iloc[0,0])
sh.cells[i + 1, 9].value = df_w['招聘要求']
sh.cells[i + 1, 11].value = df_w['公司信息']
# print(str(i), '招聘要求写入正常')
except:
print(str(i), "数据查询错误")
# sh.autofit()
wb.save('职业发展-only.xlsx')
wb.close()
app.quit()
class Web:
def __init__(self, url):
self.url = url
# 获取招聘职位信息
def web(self):
# driver.get('https://www.baidu.com/') # 加载网址
# driver.refresh()
# driver.get('https://www.baidu.com/') # 加载网址
# driver.back()
# driver.refresh()
# time.sleep(0.5+0.5)
driver.get(self.url) # 加载网址
time.sleep(1.5+0.5+0.5+0.5)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'job-list'})
# with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html.list
# 获取招聘要求和公司信息
def web_a(self, url,i):
driver.get('https://www.baidu.com/') # 加载网址
driver.refresh()
driver.get('https://www.baidu.com/') # 加载网址
driver.back()
# driver.refresh()
# print('回退刷新')
time.sleep(0.5+1)
driver.get(url) # 加载网址
# driver.refresh()
# print('刷新')
time.sleep(1)
        # 每处理8条暂停13秒,降低触发反爬的概率
        if i % 8 == 7:
            time.sleep(13)
            print('已处理{}条,暂停等待'.format(i))  # driver.refresh()
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'job-detail'}) # 整体框架
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
print('当前运行条目数',i)
return html.list
df = pd.DataFrame() # 定义 salary 全局变量
key = '数据' # # 物流经理#物流运营#物流管理【#运营#物流#数据#运输#仓储#配送】
salary = '6' # 5表示15-20K,6表示20-30k
if __name__ == '__main__':
# jobMesssage()
# jobRequire()
# opt = ChromeOptions() # 创建chrome参数
# opt.headless = False # 显示浏览器
# driver = Chrome(options=opt) # 浏览器实例化
# # driver=webdriver.Chrome()
# driver.set_window_size(300, 700)
# url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
#'https://m.zhipin.com/job_detail/?city=101280600&source=10&query=%E6%9D%AD%E5%B7%9E'
# url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
# # 招聘公司信息获取
# myWeb = Web(url) # 实例化类
# time.sleep(0.3)
# html = myWeb.web_a(url_b) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
# df5 = jobRequire(html) # 获取职位需求信息
# print(df5)
# time.sleep(0.5)
opt = ChromeOptions() # 创建chrome参数
# 不加载图片
# prefs = {"profile.managed_default_content_settings.images": 2}
# opt.add_experimental_option("prefs", prefs)
opt.headless = False # 显示浏览器
driver = Chrome(options=opt) # 浏览器实例化
# driver=webdriver.Chrome()
# driver.set_window_size(300, 700)
'''
opt = FirefoxOptions() # ChromeOptions()
#加载图片
# opt.headless = False # 显示浏览器
# driver = webdriver.Firefox(options=opt) # Chrome(options=opt) # 浏览器实例化
# # driver.set_window_size(400, 900)
# 不加载图片
opt.set_preference('permissions.default.image', 2)
opt.headless = False # 显示浏览器
driver = webdriver.Firefox(options=opt) # Chrome(options=opt) # 浏览器实例化
'''
url = 'https://www.zhipin.com/i100502-c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
'''
for i in range(6): # +str(i);key=
try:
print(str(i), '获取第{}页数据'.format(i + 1))
url = 'https://www.zhipin.com/i100502-c101210100/y_' + salary + '/?query=' + key + '&city=101210100&industry=&position=&ka=sel-salary-' + salary + '&page=' + str(
i + 1) + '&ka=page-' + str(i + 1)
print(url)
# 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
# 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
# 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
# ‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
time1 = time.time() # 计算时长
# 获取招聘职位信息
myWeb = Web(url)
html = myWeb.web() # 获取招聘岗位信息
# html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
time.sleep(0.5+1+1+0.5+0.5+0.5+1)
# print(html)
df1 = jobMesssage(html)
df = pd.concat([df1, df], axis=0)
df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
# print(url_b)
# # 招聘公司信息获取
# myWeb = Web(url) # 实例化类
# time.sleep(0.3)
# html = myWeb.web_a(url_b) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
# df2 = jobRequire(html) # 获取职位需求信息
# print(df2)
# time.sleep(0.5)
#
# df3 = pd.concat([df1, df2], axis=1)
# df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
# df = pd.concat([df, df3], axis=0)
# print(df)
# df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# time.sleep(0.5)
time2 = time.time() # 计算时长
print(str(i), '数据正常'.format(i + 1))
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), '数据异常'.format(i + 1))
# 写入excel
with open('jobBoss.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# print(data)
myWe = writeExcel(data) # 写入excel
myWe.run() # 执行多线程
'''
#单独写入excel
write_only()
try: # 关闭后台浏览器
driver.close()
driver.quit()
os.system('taskkill /F /IM chromedriver.exe') # 关闭进程浏览器
sreach_windows = driver.current_window_handle
# 获得当前所有打开的窗口的句柄
all_handles = driver.window_handles
for handle in all_handles:
driver.switch_to.window(handle)
driver.close()
time.sleep(1.2)
except:
        print('后台浏览器已关闭完毕')
Complete Boss code 2 - extracting the job requirements and company info separately:
import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os
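# Variant of the Boss script above: writeExcel.wE_r writes the scraped list data to
# jobBoss.xlsx, and wE_r_a re-opens that workbook, reads the job URLs from column I,
# and fills in the 招聘要求 / 公司信息 columns row by row via Web.web_a + jobRequire.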
def jobMesssage(html):
df_jobMesssage = pd.DataFrame()
df = pd.DataFrame()
# with open('jobhtml.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'job-list'})
# print(html.list)
for i, item in enumerate(html):
item.list = item.find_all('div', attrs={'class': 'job-primary'})
# print(item,i,sep=',')
for i, item in enumerate(item.list): # 获取每个招聘条目
# print(item, i, sep=',')
try:
item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
'\n', ' ')
print(item.list, i, sep=',')
df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
attrs={'class': 'primary-box'}).get(
'href'),
df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
'class': 'job-name'}).text,
df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
'class': 'job-area-wrapper'}).text.strip('\n'), #
df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
'').replace(
'\n', ' '),
df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
'\n', ' '),
df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
'').replace(
'\n', ' '),
# print(df_jobMesssage)
df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
df = pd.concat([df, df_jobMesssage], axis=0)
df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
print(str(i), '招聘职位写入正常')
except:
                print(str(i), '招聘职位写入异常')
return df
def jobRequire(html):
# df = pd.DataFrame()
df = {} # 定义字典
# # url='https://www.zhipin.com/job_detail/c3aea253a5b3b2501nJ92d-9GFBR.html'
# url='https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html'
# # url='https://www.zhipin.com/job_detail/1635c904e28317c31nN63ti0FlJY.html'
# cookie = 'Cookie: __guid=95203226.4063907470298592000.1630401055947.081; _bl_uid=tIkzmsaaz8bup1qepsempvm87k3z; wt2=Dt6B1sNjfS9mOw2rOUcWz7LnE65oG5AcG7C-7iuSGQ10DZgwjtuGdrBZlKOJt5QsEu8DWRIOSeNQ2a7qP7q1yRQ~~; lastCity=101210100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1630888771,1632789052,1632907583,1632959098; acw_tc=0bdd34ba16329610479403976e01a46b6a653805d48cc356c7a1254d2d5375; __c=1632959098; __a=66278464.1630401067.1632907554.1632959098.52.6.7.47; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1632962530; __zp_stoken__=0138dGiMjNjETJHpLDRQ2VDBYbnMRPGxPGRFeJC8TJ0Y%2FASEDIHMxYwBwZi8AHjN%2BTxwJVQgkUkJCHRMVQ3ACZm0YMWV2U1EgOHM5WnAVdzxse017agxTPj5JZUd4Q1w1DSU7fXVbUEcKIRY%3D; __zp_sseed__=iVynj0LLIRVDsqGYxrY8A2rJBiqFMuzEYl1KvBTzD1Q=; __zp_sname__=e948d594; __zp_sts__=1632962688132; monitor_count=40'
# # cookie ='Cookie: HMACCOUNT_BFESS=399A131593FFAEE5; BDUSS_BFESS=VpjS3U5Q1hQd3ktdkMwand3N3k1ekppN1FJSUhSc2EtdVBEMGhBaU0zSEdYbEpoRVFBQUFBJCQAAAAAAAAAAAEAAADB320FNTMzNTg5NDkzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMbRKmHG0SphW; BAIDUID_BFESS=DA74B922ACBBFCBDF71367A36C973898:FG=1'
# # cookie ='set-cookie: __zp_sseed__=iVynj0LLIRVDsqGYxrY8A7QRlGL1xd7z8VDrvc0yURg=; Path=/; Domain=.zhipin.com'
# headers = {
# 'user-agent': 'user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
# 'Cookie': cookie,
# 'Connection': 'keep - alive',
# 'Accept':'Accept: image / avif, image / webp, image / apng, image / *, * / *;q = 0.8',
# }
# # 新闻链接
# # session = requests.session()
# res = requests.get(url=url, headers=headers, timeout=30)
# res.encoding = 'utf-8'
# res.raise_for_status()
# res.encoding = res.apparent_encoding
# html = BeautifulSoup(res.text, 'html.parser')
# time.sleep(3)
# print(html)
# # 存入本地
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(res.text)
# with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'job-detail'}) # 整体框架
for i, item in enumerate(html):
# print(item,1,sep=',')
item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip('').replace(' ', '')
print(item.list, i, sep=',')
try:
df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
'\n', ' ').replace('\r', ' ').replace('\t', ' '), # 上级框架,
df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
''),
# df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
# df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# print(df)
print(str(i), '招聘职位写入正常')
except:
            print(str(i), '招聘职位写入异常')
return df
class writeExcel:
def __init__(self, data):
self.data = data
# print(data)
def wE_r(self):
app = xw.App(visible=False, add_book=False)
new_workbook = xw.Book()
new_worksheet = new_workbook.sheets.add('worksheet')
app.display_alerts = False
app.screen_updating = False
title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
"招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
new_worksheet['A1'].value = title
for i in range(len(self.data)):
try:
# df_w = jobRequire(data[i]['招聘职位网址'])
# print(data[i]['招聘职位网址'])
new_worksheet.cells[i + 1, 0].value = i + 1
new_worksheet.cells[i + 1, 1].value = data[i]['岗位名称']
new_worksheet.cells[i + 1, 2].value = '' # data[i]['发布日期']
new_worksheet.cells[i + 1, 3].value = data[i]['薪资']
new_worksheet.cells[i + 1, 4].value = data[i]['工作地及要求']
new_worksheet.cells[i + 1, 5].value = data[i]['公司名称']
new_worksheet.cells[i + 1, 6].value = '' # data[i]['公司规模']
new_worksheet.cells[i + 1, 7].value = '' # data[i]['所属行业']
new_worksheet.cells[i + 1, 8].value = data[i]['招聘职位网址']
# new_worksheet.cells[i + 1, 9].value =df_w['招聘要求']#str(df_w['招聘要求'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 10].value = '' # data[i]['招聘公司网址']
# new_worksheet.cells[i + 1, 11].value = df_w['公司信息']#str(df_w['公司信息'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 12].value = '' # data[i]['福利']
# 修改项目
new_worksheet.cells[i + 1, 13].value = key # 关键字
                new_worksheet.cells[i + 1, 14].value = '20-30k' if salary == '6' else '15-20K'  # 薪资范围
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # 记录日期
print(str(i), 'Excel数据写入正常')
except:
print(str(i), 'Excel数据写入异常')
# 招聘公司信息获取
# for i in range(len(self.data)):
# try:
# # 招聘公司信息获取
# time1 = time.time() # 计算时长
# myWeb = Web(url) # 实例化类
# time.sleep(0.5)
# html = myWeb.web_a(data[i]['招聘职位网址']) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
# df_w = jobRequire(html) # 获取职位需求信息
# print(df_w)
# time.sleep(2.5)
# new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
# new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
# print(str(i), 'Excel数据-2模块写入正常')
# time2 = time.time() # 计算时长
# print('总耗时:{}'.format(time2 - time1))
# except:
# print(str(i), 'Excel数据-2模块写入异常')
new_worksheet.autofit()
new_workbook.save('jobBoss.xlsx')
new_workbook.close()
app.quit()
def wE_r_a(self):
app = xw.App(visible=True, add_book=False)
wb=app.books.open('jobBoss.xlsx')
sh=wb.sheets['worksheet']
# print(sh.range('i2').value)
rng = [i for i in sh.range("i:i").value if i != None]# 单元格内容招聘网址
print(rng)
# j = sh.range('a1').expand('table').rows.count
# print(j)
app.display_alerts = False
app.screen_updating = False
for i in range(len(rng) - 1):
try:
# 招聘公司信息获取
time1 = time.time() # 计算时长
myWeb = Web(url) # 实例化类
time.sleep(0.5)
html = myWeb.web_a(rng[i+1]) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
df_w = jobRequire(html) # 获取职位需求信息
print(df_w)
time.sleep(2.5)
sh.cells[i + 1, 9].value = df_w['招聘要求']
sh.cells[i + 1, 11].value = df_w['公司信息']
print(str(i), 'Excel数据-2模块写入正常')
time2 = time.time() # 计算时长
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), 'Excel数据-2模块写入异常')
sh.autofit()
wb.save('jobBoss.xlsx')
wb.close()
app.quit()
    def run(self):
        # 同步执行写入
        self.wE_r()

    def run_a(self):
        # 同步执行写入(补全招聘要求与公司信息)
        self.wE_r_a()
class Web:
def __init__(self, url):
self.url = url
# 获取招聘职位信息
def web(self):
driver.back()
# driver.refresh()
time.sleep(0.5)
driver.get(self.url) # 加载网址
time.sleep(1.5)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'job-list'})
# with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html.list
# 获取招聘要求和公司信息
def web_a(self, url):
driver.back()
# driver.refresh()
time.sleep(0.5)
driver.get(url) # 加载网址
time.sleep(1.5)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'job-detail'}) # 整体框架
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html.list
df = pd.DataFrame() # 定义 全局变量
key = '物流管理' # 物流经理#物流运营
salary = '6' # 5表示15-20K,6表示20-30k
if __name__ == '__main__':
# jobMesssage()
# jobRequire()
# opt = ChromeOptions() # 创建chrome参数
# opt.headless = False # 显示浏览器
# driver = Chrome(options=opt) # 浏览器实例化
# # driver=webdriver.Chrome()
# driver.set_window_size(300, 700)
# url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
# url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
# # 招聘公司信息获取
# myWeb = Web(url) # 实例化类
# time.sleep(0.3)
# html = myWeb.web_a(url_b) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
# df5 = jobRequire(html) # 获取职位需求信息
# print(df5)
# time.sleep(0.5)
opt = ChromeOptions() # 创建chrome参数
# 不加载图片
prefs = {"profile.managed_default_content_settings.images": 2}
opt.add_experimental_option("prefs", prefs)
opt.headless = False # 显示浏览器
driver = Chrome(options=opt) # 浏览器实例化
# driver=webdriver.Chrome()
driver.set_window_size(300, 700)
url = 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
'''
for i in range(3): # +str(i);key=
try:
print(str(i), '获取第{}页数据'.format(i + 1))
url='https://www.zhipin.com/c101210100/y_'+salary+'/?query='+key+'&city=101210100&industry=&position=&ka=sel-salary-'+salary+'&page='+str(i+1)+'&ka=page-'+str(i+1)
print(url)
#'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
#'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
#'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
#‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
time1 = time.time() # 计算时长
# 获取招聘职位信息
myWeb=Web(url)
html=myWeb.web()#获取招聘岗位信息
# html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
time.sleep(0.5)
# print(html)
df1 = jobMesssage(html)
df = pd.concat([df1, df], axis=0)
df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
# print(url_b)
# # 招聘公司信息获取
# myWeb = Web(url) # 实例化类
# time.sleep(0.3)
# html = myWeb.web_a(url_b) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
# df2 = jobRequire(html) # 获取职位需求信息
# print(df2)
# time.sleep(0.5)
#
# df3 = pd.concat([df1, df2], axis=1)
# df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
# df = pd.concat([df, df3], axis=0)
# print(df)
# df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# time.sleep(0.5)
time2 = time.time() # 计算时长
print(str(i), '数据正常'.format(i + 1))
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), '数据异常'.format(i + 1))
# 写入excel
with open('jobBoss.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# print(data)
myWe = writeExcel(data) # 写入excel
myWe.run() # 执行多线程
'''
# 写入excel_a
with open('jobBoss.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# print(data)
myWe = writeExcel(data) # 写入excel
    myWe.run_a()  # 执行写入
try: # 关闭后台浏览器
driver.close()
driver.quit()
os.system('taskkill /F /IM chromedriver.exe') # 关闭进程浏览器
sreach_windows = driver.current_window_handle
# 获得当前所有打开的窗口的句柄
all_handles = driver.window_handles
for handle in all_handles:
driver.switch_to.window(handle)
driver.close()
time.sleep(1.2)
except:
        print('后台浏览器已关闭完毕')
Querying the corresponding job details separately and writing them to Excel:
import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os
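# Single-pass variant: crawls up to three list pages into jobBoss.json, then writeExcel
# writes the rows and immediately revisits each job URL to fill the requirements and
# company-info columns before saving jobBoss.xlsx.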
def jobMesssage(html):
df_jobMesssage = pd.DataFrame()
df = pd.DataFrame()
# with open('jobhtml.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'job-list'})
# print(html.list)
for i, item in enumerate(html):
item.list = item.find_all('div', attrs={'class': 'job-primary'})
# print(item,i,sep=',')
for i, item in enumerate(item.list): # 获取每个招聘条目
# print(item, i, sep=',')
try:
item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
'\n', ' ')
print(item.list, i, sep=',')
df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
attrs={'class': 'primary-box'}).get(
'href'),
df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
'class': 'job-name'}).text,
df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
'class': 'job-area-wrapper'}).text.strip('\n'), #
df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
'').replace(
'\n', ' '),
df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
'\n', ' '),
df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
'').replace(
'\n', ' '),
# print(df_jobMesssage)
df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
df = pd.concat([df, df_jobMesssage], axis=0)
df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
print(str(i), '招聘职位写入正常')
except:
                print(str(i), '招聘职位写入异常')
return df
def jobRequire(html):
# df = pd.DataFrame()
df = {} # 定义字典
# # url='https://www.zhipin.com/job_detail/c3aea253a5b3b2501nJ92d-9GFBR.html'
# url='https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html'
# # url='https://www.zhipin.com/job_detail/1635c904e28317c31nN63ti0FlJY.html'
# cookie = 'Cookie: __guid=95203226.4063907470298592000.1630401055947.081; _bl_uid=tIkzmsaaz8bup1qepsempvm87k3z; wt2=Dt6B1sNjfS9mOw2rOUcWz7LnE65oG5AcG7C-7iuSGQ10DZgwjtuGdrBZlKOJt5QsEu8DWRIOSeNQ2a7qP7q1yRQ~~; lastCity=101210100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1630888771,1632789052,1632907583,1632959098; acw_tc=0bdd34ba16329610479403976e01a46b6a653805d48cc356c7a1254d2d5375; __c=1632959098; __a=66278464.1630401067.1632907554.1632959098.52.6.7.47; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1632962530; __zp_stoken__=0138dGiMjNjETJHpLDRQ2VDBYbnMRPGxPGRFeJC8TJ0Y%2FASEDIHMxYwBwZi8AHjN%2BTxwJVQgkUkJCHRMVQ3ACZm0YMWV2U1EgOHM5WnAVdzxse017agxTPj5JZUd4Q1w1DSU7fXVbUEcKIRY%3D; __zp_sseed__=iVynj0LLIRVDsqGYxrY8A2rJBiqFMuzEYl1KvBTzD1Q=; __zp_sname__=e948d594; __zp_sts__=1632962688132; monitor_count=40'
# # cookie ='Cookie: HMACCOUNT_BFESS=399A131593FFAEE5; BDUSS_BFESS=VpjS3U5Q1hQd3ktdkMwand3N3k1ekppN1FJSUhSc2EtdVBEMGhBaU0zSEdYbEpoRVFBQUFBJCQAAAAAAAAAAAEAAADB320FNTMzNTg5NDkzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMbRKmHG0SphW; BAIDUID_BFESS=DA74B922ACBBFCBDF71367A36C973898:FG=1'
# # cookie ='set-cookie: __zp_sseed__=iVynj0LLIRVDsqGYxrY8A7QRlGL1xd7z8VDrvc0yURg=; Path=/; Domain=.zhipin.com'
# headers = {
# 'user-agent': 'user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
# 'Cookie': cookie,
# 'Connection': 'keep - alive',
# 'Accept':'Accept: image / avif, image / webp, image / apng, image / *, * / *;q = 0.8',
# }
# # 新闻链接
# # session = requests.session()
# res = requests.get(url=url, headers=headers, timeout=30)
# res.encoding = 'utf-8'
# res.raise_for_status()
# res.encoding = res.apparent_encoding
# html = BeautifulSoup(res.text, 'html.parser')
# time.sleep(3)
# print(html)
# # 存入本地
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(res.text)
# with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
# html = BeautifulSoup(f, 'html.parser')
# html.list = html.find_all('div', attrs={'class': 'job-detail'}) # 整体框架
for i, item in enumerate(html):
# print(item,1,sep=',')
item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip('').replace(' ', '')
print(item.list, i, sep=',')
try:
df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
'\n', ' ').replace('\r', ' ').replace('\t', ' '), # 上级框架,
df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
''),
# df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
# df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# print(df)
print(str(i), '招聘职位写入正常')
except:
            print(str(i), '招聘职位写入异常')
return df
class writeExcel:
def __init__(self, data):
self.data = data
# print(data)
def wE_r(self):
app = xw.App(visible=False, add_book=False)
new_workbook = xw.Book()
new_worksheet = new_workbook.sheets.add('worksheet')
app.display_alerts = False
app.screen_updating = False
title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
"招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
new_worksheet['A1'].value = title
for i in range(len(self.data)):
try:
# df_w = jobRequire(data[i]['招聘职位网址'])
# print(data[i]['招聘职位网址'])
new_worksheet.cells[i + 1, 0].value = i + 1
new_worksheet.cells[i + 1, 1].value = data[i]['岗位名称']
new_worksheet.cells[i + 1, 2].value = '' # data[i]['发布日期']
new_worksheet.cells[i + 1, 3].value = data[i]['薪资']
new_worksheet.cells[i + 1, 4].value = data[i]['工作地及要求']
new_worksheet.cells[i + 1, 5].value = data[i]['公司名称']
new_worksheet.cells[i + 1, 6].value = '' # data[i]['公司规模']
new_worksheet.cells[i + 1, 7].value = '' # data[i]['所属行业']
new_worksheet.cells[i + 1, 8].value = data[i]['招聘职位网址']
# new_worksheet.cells[i + 1, 9].value =df_w['招聘要求']#str(df_w['招聘要求'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 10].value = '' # data[i]['招聘公司网址']
# new_worksheet.cells[i + 1, 11].value = df_w['公司信息']#str(df_w['公司信息'].values).strip("['").strip("']").strip('')
new_worksheet.cells[i + 1, 12].value = '' # data[i]['福利']
# 修改项目
new_worksheet.cells[i + 1, 13].value = key # 关键字
new_worksheet.cells[i + 1, 14].value = '20-30k' if salary == '6' else '15-20K' # 薪资范围
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # 记录日期
print(str(i), 'Excel数据写入正常')
except:
print(str(i), 'Excel数据写入异常')
# 招聘公司信息获取
for i in range(len(self.data)):
try:
# 招聘公司信息获取
time1 = time.time() # 计算时长
myWeb = Web(url) # 实例化类
time.sleep(0.5)
html = myWeb.web_a(data[i]['招聘职位网址']) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
df_w = jobRequire(html) # 获取职位需求信息
print(df_w)
time.sleep(3)
new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
print(str(i), 'Excel数据-2模块写入正常')
time2 = time.time() # 计算时长
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), 'Excel数据-2模块写入异常')
new_worksheet.autofit()
new_workbook.save('jobBoss.xlsx')
new_workbook.close()
app.quit()
    def run(self):
        # 同步执行写入
        self.wE_r()
class Web:
def __init__(self, url):
self.url = url
# 获取招聘职位信息
def web(self):
driver.back()
# driver.refresh()
time.sleep(0.5)
driver.get(self.url) # 加载网址
time.sleep(1.5)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'job-list'})
# with open('jobhtml.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html.list
# 获取招聘要求和公司信息
def web_a(self, url):
driver.back()
# driver.refresh()
print('回退刷新')
time.sleep(0.5)
driver.get(url) # 加载网址
# driver.refresh()
# print('刷新')
time.sleep(2)
source = driver.page_source # 页面内容实例化
html = BeautifulSoup(source, 'html.parser') # 获取页面内容
html.list = html.find_all('div', attrs={'class': 'job-detail'}) # 整体框架
# with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
# f.write(source)
# print(html)
return html.list
df = pd.DataFrame() # 定义 salary 全局变量
key = '物流管理' # 物流经理#物流运营
salary = '5' # 5表示15-20K,6表示20-30k
if __name__ == '__main__':
# jobMesssage()
# jobRequire()
# opt = ChromeOptions() # 创建chrome参数
# opt.headless = False # 显示浏览器
# driver = Chrome(options=opt) # 浏览器实例化
# # driver=webdriver.Chrome()
# driver.set_window_size(300, 700)
# url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
# url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
# # 招聘公司信息获取
# myWeb = Web(url) # 实例化类
# time.sleep(0.3)
# html = myWeb.web_a(url_b) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
# df5 = jobRequire(html) # 获取职位需求信息
# print(df5)
# time.sleep(0.5)
opt = ChromeOptions() # 创建chrome参数
# 不加载图片
prefs = {"profile.managed_default_content_settings.images": 2}
opt.add_experimental_option("prefs", prefs)
opt.headless = False # 显示浏览器
driver = Chrome(options=opt) # 浏览器实例化
# driver=webdriver.Chrome()
driver.set_window_size(300, 700)
url = 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
for i in range(3): # +str(i);key=
try:
print(str(i), '获取第{}页数据'.format(i + 1))
url = 'https://www.zhipin.com/c101210100/y_' + salary + '/?query=' + key + '&city=101210100&industry=&position=&ka=sel-salary-' + salary + '&page=' + str(
i + 1) + '&ka=page-' + str(i + 1)
print(url)
# 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
# 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
# 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
# ‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
time1 = time.time() # 计算时长
# 获取招聘职位信息
myWeb = Web(url)
html = myWeb.web() # 获取招聘岗位信息
# html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
time.sleep(0.5)
# print(html)
df1 = jobMesssage(html)
df = pd.concat([df1, df], axis=0)
df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
# print(url_b)
# # 招聘公司信息获取
# myWeb = Web(url) # 实例化类
# time.sleep(0.3)
# html = myWeb.web_a(url_b) # 'https://jobs.51job.com/all/co3836624.html') # 实例化网址
# df2 = jobRequire(html) # 获取职位需求信息
# print(df2)
# time.sleep(0.5)
#
# df3 = pd.concat([df1, df2], axis=1)
# df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
# df = pd.concat([df, df3], axis=0)
# print(df)
# df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
# time.sleep(0.5)
time2 = time.time() # 计算时长
print(str(i), '数据正常'.format(i + 1))
print('总耗时:{}'.format(time2 - time1))
except:
print(str(i), '数据异常'.format(i + 1))
# 写入excel
with open('jobBoss.json', 'r', encoding='utf-8') as f:
data = json.load(f)
# print(data)
myWe = writeExcel(data) # 写入excel
    myWe.run()  # 执行写入
try: # 关闭后台浏览器
driver.close()
driver.quit()
os.system('taskkill /F /IM chromedriver.exe') # 关闭进程浏览器
sreach_windows = driver.current_window_handle
# 获得当前所有打开的窗口的句柄
all_handles = driver.window_handles
for handle in all_handles:
driver.switch_to.window(handle)
driver.close()
time.sleep(1.2)
except:
        print('后台浏览器已关闭完毕')