# 爬虫代码 — crawler code
import requests
from lxml import etree
import json
import time
def spider_job(href):
    """Fetch the raw page body for *href* from 51job.

    Sends a desktop-browser User-Agent so the site serves the normal
    HTML page, with a 5-second timeout so a stalled request cannot hang
    the crawl loop.

    Returns the response body as bytes. Raises
    requests.exceptions.RequestException (including HTTPError for
    non-2xx statuses) on failure, which the caller's retry loop handles.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36"
    }
    response = requests.get(href, headers=headers, timeout=5)
    # Fail fast on HTTP-level errors (403/500/...) instead of returning
    # an error page that would only blow up later during parsing.
    response.raise_for_status()
    return response.content
def analysis_content(content, filename):
    """Extract the job-listing JSON embedded in one 51job search page
    and persist it to job_file/<filename>.json.

    The listing data sits inside an inline <script> tag as a JavaScript
    assignment; this function slices that assignment down to a JSON
    object, wraps it together with the page name, and writes it out.

    NOTE(review): the slicing below is tightly coupled to 51job's exact
    page format (a single matching <script>, a trailing "jobid_count"
    key) — any site markup change will break it.

    Returns True when the page contained at least one posting (file
    written), False when the result list was empty — the caller uses
    False as the "no more pages" signal.
    """
    html = etree.HTML(content)
    # Grab the text of the inline script holding the search-result data.
    job = html.xpath('//script[@type="text/javascript"]/text()')
    # Strip line breaks and JS escape backslashes so the remaining text
    # parses as plain JSON.
    job = job[0].replace('\r\n', '')
    job = job.replace('\\', '')
    # Drop everything up to the '=' of the JS assignment, keeping only
    # the object literal on its right-hand side.
    job = job[job.find('=') + 1:]
    # Truncate at the trailing "jobid_count" key and re-close the object.
    job = job[:job.rfind(',"jobid_count"')] + '}'
    # Wrap with the page name so each saved file is self-describing.
    job = '{"name":"' + filename + '","content":' + job + "}"
    # Round-trip through json.loads to validate the reconstructed JSON.
    job_1 = json.loads(job)
    # An empty engine_jds list means this search has run out of pages.
    if not len(job_1['content']['engine_jds']):
        return False
    # assumes the job_file/ directory already exists — TODO confirm
    with open('job_file/{}.json'.format(filename), 'w', encoding='utf-8') as f:
        f.write(job)
    print('文件 {} 写入成功'.format(filename))
    return True
def run():
    """Crawl 51job search results for every keyword × province pair.

    For each combination, pages are fetched sequentially from 1 until a
    page comes back empty (analysis_content returns False) or three
    consecutive errors occur, after which the crawl moves to the next
    combination. A 5-second pause between requests keeps the rate polite.

    Fixes over the original: the bare ``except:`` is narrowed to
    ``except Exception`` so Ctrl-C / SystemExit still stop the crawl,
    and the error counter is reset after each successful page so only
    *consecutive* failures abort a combination.
    """
    # Search keywords (job fields / employer types, in Chinese).
    search = ['web', 'java', '银行', '移动开发', '通信技术开发及应用', '国企', '事业单位', '政府机关', '上市公司']
    # 51job area codes for the provinces of interest.
    province = [
        {'name': '贵州', 'id': '260200'},
        {'name': '重庆', 'id': '060100'},
        {'name': '四川', 'id': '090201'},
        {'name': '云南', 'id': '250200'},
        {'name': '湖南', 'id': '190200'},
        {'name': '广西', 'id': '140200'},
    ]
    # Placeholders: area id, keyword, page number. workyear=01 filters
    # for under-one-year experience.
    url = 'https://search.51job.com/list/{},000000,0000,00,9,99,{},2,{}.html?workyear=01'
    for keyword in search:
        for area in province:
            page = 1
            err = 0
            while True:
                try:
                    response = spider_job(url.format(area['id'], keyword, page))
                    flag = analysis_content(response, '{}_{}_{}'.format(keyword, area['name'], page))
                    time.sleep(5)
                    if not flag:
                        # Empty page: no more results for this pair.
                        break
                    page += 1
                    err = 0  # a good page resets the consecutive-error count
                except Exception:
                    err += 1
                    if err >= 3:
                        print('错误 {} 次, 已重新更换链接'.format(err))
                        break
                    print('错误 {} 次'.format(err))
if __name__ == "__main__":
    run()
# 数据筛选并写入Excel — filter the data and write it to Excel
import os
import json
import openpyxl
def jobCollect(content):
    """Flatten the postings in *content* to a list of field dicts.

    Pulls the seven fields of interest from every entry in
    content["engine_jds"], preserving their order. A missing field
    raises KeyError, matching the strictness of per-field lookups.
    """
    wanted = (
        'job_name',
        'job_href',
        'providesalary_text',
        'company_name',
        'issuedate',
        'workarea_text',
        'companytype_text',
    )
    return [
        {field: posting[field] for field in wanted}
        for posting in content["engine_jds"]
    ]
def run():
    """Merge the crawled JSON files into 51job.xlsx, one sheet per search.

    Each file 51job/job_file/<keyword>_<province>_<page>.json is
    flattened via jobCollect and written to the sheet named
    <keyword>_<province> (page suffix stripped). Existing sheets are
    appended to; new sheets get a header row first.

    Fix over the original: the workbook was re-loaded and re-saved for
    every input file (quadratic total I/O); it is now loaded once and
    saved once after all files are processed.
    """
    title = ['岗位', '工作链接', '工资', '公司名称', '发布时间', '工作地点', '公司类型']
    # Load once, outside the per-file loop; 51job.xlsx must already exist.
    wb = openpyxl.load_workbook('51job.xlsx')
    for file in os.listdir('51job/job_file'):
        with open('51job/job_file/{}'.format(file), encoding='utf-8') as f:
            job_json = json.loads(f.read())
        job_linchpin = jobCollect(job_json['content'])
        # Sheet name is the file name with the trailing "_<page>" removed.
        sheet_name = file[:file.rfind('_')]
        if sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]
            # Append below the last used row (sheets created here always
            # hold at least the header row, so max_row is reliable).
            row = sheet.max_row
            for record in job_linchpin:
                row += 1
                for col, value in enumerate(record.values(), start=1):
                    sheet.cell(row, col, value)
            print("追加成功")
        else:
            sheet = wb.create_sheet(sheet_name)
            for col, head in enumerate(title, start=1):
                sheet.cell(1, col, head)
            # Data starts on row 2, directly under the header.
            for row, record in enumerate(job_linchpin, start=2):
                for col, value in enumerate(record.values(), start=1):
                    sheet.cell(row, col, value)
            print("写入成功")
    wb.save('51job.xlsx')
    wb.close()
if __name__ == "__main__":
    run()
|