Introduction
This project is my course final assignment and a showcase of three months of learning Python: it scrapes job postings and analyses how salaries vary across cities.
Results
[Screenshots: visualization result, Excel result, JSON result, MySQL result]
The source code is on Gitee: https://gitee.com/ren-wenqing/PythonLearning.git
Target site
https://search.51job.com/
Parameters
Example: https://search.51job.com/list/090000,000000,0000,00,9,99,java,2,79.html, where 090000 is the city code, java is the search keyword, and 79 is the page number.
You can work out the parameters yourself by typing a query into the search box at https://search.51job.com/ and watching how the browser's address bar changes, as the sketch below shows.
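As a quick illustration, the search URL can be assembled from those three values (build_url is a hypothetical helper for this post, not part of the project):

def build_url(city_code, keyword, page):
    # Pattern taken from the example URL above; the other fields stay fixed
    return 'https://search.51job.com/list/{},000000,0000,00,9,99,{},2,{}.html'.format(
        city_code, keyword, page)

print(build_url('090000', 'java', 79))
# -> https://search.51job.com/list/090000,000000,0000,00,9,99,java,2,79.html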
Environment
python 3.9.6, mysql 8.0.18. Required libraries:
import collections
import json
import xlsxwriter as xlsxwriter
from utils import get_header
import requests
import queue
import threading
import os
import csv
import re
import pymysql
import matplotlib.pyplot as plt
from openpyxl import load_workbook
from faker import Faker
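Of these, collections, json, queue, threading, os, csv, and re are in the standard library; the third-party packages can be installed with `pip install requests xlsxwriter pymysql matplotlib openpyxl faker`; utils (providing get_header) is the project's own module.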
Project analysis
Basic scraping
Sending a request straight to the page returns its source:
req = requests.get(url=url, headers=self.header)
The result: the loading box on the page stays stuck in a loading state and none of the data we need is there, even though the same URL displays normally in a browser.
This is because the requests library cannot execute JavaScript; the job list is embedded in the page source as a JS variable. Viewing the source, there is one extremely long line of JS, and pasting it into a JSON viewer shows it contains exactly the data we need, so plain regular-expression matching is the fastest way to extract it:
def Spider(self):
    while not self.pagequeue.empty():
        url = self.pagequeue.get()
        print('Crawling: {}'.format(url))
        req = requests.get(url, headers=get_header())
        response = req.content.decode('gbk')
        # All fields live in one long JS object, so extract every column
        # from the page in one pass with regular expressions.
        job_names = re.findall(r'"job_title":"(.*?)"', response)
        company_names = re.findall(r'"company_name":"(.*?)"', response)
        salaries = re.findall(r'"providesalary_text":"(.*?)"', response)
        work_areas = re.findall(r'"workarea_text":"(.*?)"', response)
        times = re.findall(r'"issuedate":"(.*?)"', response)
        company_types = re.findall(r'"companytype_text":"(.*?)"', response)
        company_hangyes = re.findall(r'"companyind_text":"(.*?)"', response)
        details = re.findall(r'"job_href":"(.*?)"', response)
        for i in range(len(job_names)):
            try:
                if len(salaries[i]) == 0:
                    continue  # skip postings without a salary
                # Strip the escaped "\/" sequences and normalise the salary
                salary = salaries[i].replace('\\/', '/')
                salary = formatyue(salary)
                salary = formatqian(salary)
                salary = fomatpjun(salary)
                detail = details[i].replace('\\/', '/')
                company_hangye = company_hangyes[i].replace('\\/', '/')
                data = {
                    "职位名称": job_names[i],
                    "薪资(万/月)": salary,
                    "公司名称": company_names[i],
                    "工作地点": work_areas[i],
                    "发布时间": times[i],
                    "公司链接": detail,
                    "公司类型": company_types[i],
                    "公司行业": company_hangye,
                }
                self.jobqueue.put(data)
            except (IndexError, ValueError):
                continue
These are the values I saw while debugging; they contain many escaped "\" characters that have to be stripped. I also defined three functions to normalise the salary into a single format, because some salaries are quoted per thousand per month (千/月), some per ten-thousand per month (万/月), and some per ten-thousand per year (万/年).
def formatyue(s):
    # "X-Y万/年" (10k CNY per year) -> "X-Y万/月" by dividing by 12
    sa = re.findall(r'(.*?)万/年', s)
    if len(sa) == 0:
        return s
    sa1 = re.findall(r'(.*?)-(.*)', sa[0])
    saf = float(sa1[0][0]) / 12
    sal = float(sa1[0][1]) / 12
    return "{}-{}万/月".format(round(saf, 1), round(sal, 1))

def formatqian(s):
    # "X-Y千/月" (1k CNY per month) -> "X-Y万/月" by dividing by 10
    sa = re.findall(r'(.*?)千/月', s)
    if len(sa) == 0:
        return s
    sa1 = re.findall(r'(.*?)-(.*)', sa[0])
    saf = float(sa1[0][0]) / 10
    sal = float(sa1[0][1]) / 10
    return "{}-{}万/月".format(round(saf, 1), round(sal, 1))

def fomatpjun(s):
    # Collapse "X-Y万/月" to the midpoint (X+Y)/2 as a plain number string
    sa = re.findall(r'(.*?)万/月', s)
    if len(sa) == 0:
        return s  # no 万/月 pattern: return unchanged
    sa1 = re.findall(r'(.*?)-(.*)', sa[0])
    if len(sa1) == 0:
        return sa[0]  # single value, no range to average
    saf = float(sa1[0][0]) + float(sa1[0][1])
    return "{}".format(round(saf / 2, 1))
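A quick sanity check of the normalisation chain, on made-up inputs, shows how the three functions compose:

# Made-up inputs; each salary ends up as a single 万/月 midpoint
for raw in ['1.5-2万/月', '8-10千/月', '12-18万/年']:
    print(fomatpjun(formatqian(formatyue(raw))))
# 1.8   (midpoint of 1.5-2)
# 0.9   (8-10千/月 -> 0.8-1.0万/月 -> 0.9)
# 1.2   (12-18万/年 -> 1.0-1.5万/月 -> 1.2)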
Multithreaded scraping
To speed up the crawl:
def run(self):
    thread_list = []
    for i in range(self.thread):
        t = threading.Thread(target=self.Spider)
        thread_list.append(t)
    for t in thread_list:
        t.daemon = True  # idiomatic replacement for the deprecated setDaemon(True)
        t.start()
    for t in thread_list:
        t.join()
The city field is typed in as text, but the URL needs the city's numeric code, so we look it up in the code-to-city mapping published as JS. Link: https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js
def _get_city_code(self):
    # The JS file maps codes to names, e.g. "090000":"武汉", so the
    # 6-digit code sits 9 to 3 characters before the city name.
    url = 'https://js.51jobcdn.com/in/js/2016/layer/area_array_c.js'
    req = requests.get(url, headers=self.header).text
    a = req.find(self.city)
    return req[a - 9:a - 3]
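To see why the slicing works, here is the same lookup on an inline snippet of the mapping file's format:

text = '"090000":"武汉"'
a = text.find('武汉')      # a points at the city name
print(text[a - 9:a - 3])   # -> 090000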
Storing the data
CSV
After checking that the output path exists, drain the job queue that Spider filled and write the records out:
if os.path.exists(self.path):
    data_list = []
    self.path = os.path.join(self.path, 'save_list')
    while not self.jobqueue.empty():
        data_list.append(self.jobqueue.get())
    with open(os.path.join(self.path, 'info——job——{}——zone{}.csv'.format(self.keyword, self.city)),
              'w', newline='', encoding='utf-8') as f:
        f_csv = csv.DictWriter(f, self.csv_header)
        f_csv.writeheader()
        f_csv.writerows(data_list)
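self.csv_header is not shown in the post; for the DictWriter to work it must list exactly the dict keys produced by Spider, presumably:

# Assumed definition, matching the keys of the data dict built in Spider
self.csv_header = ["职位名称", "薪资(万/月)", "公司名称", "工作地点",
                   "发布时间", "公司链接", "公司类型", "公司行业"]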
JSON
The JSON file is generated from the database, sorted by salary in descending order:
import json
import pymysql

def check():
    con = pymysql.connect(host='localhost', port=3306, user='root', password='0',
                          database='qcwy', charset='utf8')
    cursor = con.cursor()
    # salary is stored as a varchar, so this ordering is lexicographic
    sql = "SELECT * FROM `jobinfo` ORDER BY salary DESC;"
    cursor.execute(sql)
    results = cursor.fetchall()
    # Column names in table order, minus the leading id column
    col_names = ['job_name', 'salary', 'company_name', 'work_area',
                 'time', 'detail', 'company_type', 'company_hangye']
    strs = {}
    for row in results:
        strs[row[0]] = dict(zip(col_names, row[1:]))  # key each record by id
    cursor.close()
    con.close()
    result = json.dumps(strs, ensure_ascii=False)  # keep Chinese text readable
    filename = 'gongsi.json'
    with open(filename, 'w', encoding='utf-8') as file_obj:
        file_obj.write(result)
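A minimal sketch of reading the export back, assuming the file produced above: each top-level key is a record id and each value is the column dict built in check().

import json

with open('gongsi.json', encoding='utf-8') as f:
    jobs = json.load(f)
for job_id, info in list(jobs.items())[:3]:
    print(job_id, info['salary'], info['job_name'])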
Excel
import xlsxwriter

workbook = xlsxwriter.Workbook(r'C:\Users\SixStart\PycharmProjects\pythonProject1\期末\python.xlsx')
worksheet = workbook.add_worksheet()
# Row 1: column headers; columns L/M hold the salary histogram read later
worksheet.write("A1", "职位名称")
worksheet.write("B1", "薪资(万/月)")
worksheet.write("C1", "公司名称")
worksheet.write("D1", "工作地点")
worksheet.write("E1", "发布时间")
worksheet.write("F1", "公司链接")
worksheet.write("G1", "公司类型")
worksheet.write("H1", "公司行业")
worksheet.write("L1", "工资")
worksheet.write("M1", "次数")
salarylist = []
for i in range(len(data_list)):  # start at 0 so the first record is not lost
    job_name = data_list[i]['职位名称']
    salary = data_list[i]['薪资(万/月)']
    company_name = data_list[i]['公司名称']
    work_area = data_list[i]['工作地点']
    time = data_list[i]['发布时间']
    detail = data_list[i]['公司链接']
    company_type = data_list[i]['公司类型']
    company_hangye = data_list[i]['公司行业']
    salarylist.append(salary)
    try:
        # Data rows start at row 2, below the headers
        worksheet.write("A{}".format(i + 2), job_name)
        worksheet.write("B{}".format(i + 2), salary)
        worksheet.write("C{}".format(i + 2), company_name)
        worksheet.write("D{}".format(i + 2), work_area)
        worksheet.write("E{}".format(i + 2), time)
        worksheet.write("F{}".format(i + 2), detail)
        worksheet.write("G{}".format(i + 2), company_type)
        worksheet.write("H{}".format(i + 2), company_hangye)
    except Exception:
        pass
workbook.close()
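The counting code that fills columns L (工资) and M (次数) is not shown in the post; a hedged sketch of how it could be done with collections.Counter (the collections module is already imported at the top), placed just before workbook.close():

import collections

counts = collections.Counter(salarylist)  # salary value -> number of postings
for row, (money, num) in enumerate(sorted(counts.items()), start=2):
    worksheet.write("L{}".format(row), money)
    worksheet.write("M{}".format(row), num)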
MySQL
CREATE TABLE `jobinfo` (
`id` int(11) NOT NULL AUTO_INCREMENT COMMENT '信息编号',
`job_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '职位名称',
`salary` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NULL DEFAULT '面议' COMMENT '薪资',
`company_name` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司名称',
`work_area` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '工作地点',
`time` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '发布时间',
`detail` varchar(255) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司链接',
`company_type` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司类型',
`company_hangye` varchar(50) CHARACTER SET utf8 COLLATE utf8_general_ci NOT NULL COMMENT '公司行业',
PRIMARY KEY (`id`) USING BTREE
) ENGINE = InnoDB AUTO_INCREMENT = 48561 CHARACTER SET = utf8 COLLATE = utf8_general_ci ROW_FORMAT = Dynamic;
SET FOREIGN_KEY_CHECKS = 1;
import pymysql

con = pymysql.connect(host='localhost', port=3306, user='root', password='0',
                      database='qcwy', charset='utf8')
cursor = con.cursor()
# Parameterised query: quotes in scraped text cannot break the SQL
sql = ("INSERT INTO `qcwy`.`jobinfo`(`id`, `job_name`, `salary`, `company_name`, `work_area`, "
       "`time`, `detail`, `company_type`, `company_hangye`) "
       "VALUES (null, %s, %s, %s, %s, %s, %s, %s, %s);")
try:
    cursor.execute(sql, (job_name, salary, company_name, work_area,
                         time, detail, company_type, company_hangye))
    con.commit()
except pymysql.MySQLError:
    con.rollback()
cursor.close()
con.close()
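When inserting many rows, a hedged alternative is to batch them with executemany, reusing the parameterised statement above (data_list as in the Excel section):

# Sketch: insert all queued rows in one call instead of one execute() per row
rows = [(d["职位名称"], d["薪资(万/月)"], d["公司名称"], d["工作地点"],
         d["发布时间"], d["公司链接"], d["公司类型"], d["公司行业"])
        for d in data_list]
cursor.executemany(sql, rows)
con.commit()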
Displaying the data
This step is separate from the crawl, so it reopens the Excel file and reads from it (you can hardly re-crawl a few hundred pages on every run).
import matplotlib.pyplot as plt
from openpyxl import load_workbook

def showpicuure():
    salarymoneylist = []
    salarycountlist = []
    wb = load_workbook('python.xlsx')
    ws = wb.active
    # Column L holds the distinct salary values, column M how often each occurs
    for col in ws['L']:
        if isinstance(col.value, str):
            salarymoneylist.append(col.value)
    salarymoneylist.remove("工资")  # drop the header cell
    for col in ws['M']:
        if isinstance(col.value, int):
            salarycountlist.append(col.value)
    plt.style.use('ggplot')
    customers_index = range(len(salarymoneylist))
    fig = plt.figure(figsize=(15, 3), dpi=100)
    ax1 = fig.add_subplot(1, 1, 1)
    ax1.bar(customers_index, salarycountlist, align='center', color='darkblue')
    ax1.xaxis.set_ticks_position('bottom')
    ax1.yaxis.set_ticks_position('left')
    plt.xticks(customers_index, salarymoneylist, rotation=0, fontsize='small')
    plt.xlabel('salary')  # x axis shows the salary values (万/月)
    plt.ylabel('count')   # y axis shows how many postings offer each salary
    plt.title('Salary Statistics')
    # Save before show(); opening a new figure here would save a blank image
    plt.savefig('salary.png', bbox_inches='tight')
    plt.show()
运行
源代码在本博客开头哦,我博客上的这些当然是运行不了的,只能作为源代码的讲解
if __name__ == '__main__':
    key1 = input("Enter a keyword: ")
    city1 = input("Enter a city: ")  # must be a Chinese city name found in area_array_c.js
    zhaopin(keyword=key1, city=city1).run()
    check()
    showpicuure()
Notes
It does not succeed on every run, though most runs are fine. The failures seem related to network quality: if you see gbk decoding errors, the response was probably truncated by a slow connection, and some errors may also be caused by the faker library I pulled in.
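Since the gbk failures look like truncated responses, one hedged mitigation (not in the original project) is a small retry wrapper around the request:

import time
import requests

def get_with_retry(url, headers, tries=3):
    # Retry a flaky request a few times before giving up
    for attempt in range(tries):
        try:
            req = requests.get(url, headers=headers, timeout=10)
            return req.content.decode('gbk')
        except (requests.RequestException, UnicodeDecodeError):
            time.sleep(1)  # brief pause before retrying
    return None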