Preface
With graduation approaching, most students face the pressure of job hunting, and quickly finding a desirable position and submitting a resume in time has become a common concern. This article designs and implements a crawler that collects job postings from a mainstream recruitment site (such as Lagou; the example below uses 51job).
1. What is a web crawler?
At its core, a crawler simulates a browser opening a web page, extracts the data we need, and saves it to a database or an Excel file.
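That boils down to three steps: request the page, parse out the fields of interest, and persist them. Below is a minimal sketch of the fetch-and-parse part using the same libraries as the crawler in this article; the URL https://example.com and the <title> lookup are placeholders for illustration only, not part of the actual crawler.
import urllib.request
from bs4 import BeautifulSoup

# Placeholder target; any page you are allowed to fetch works the same way.
req = urllib.request.Request("https://example.com",
                             headers={"User-Agent": "Mozilla/5.0"})
html = urllib.request.urlopen(req).read().decode("utf-8")
soup = BeautifulSoup(html, "html.parser")
print(soup.title.string if soup.title else "no <title> found")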
2. Crawler code
The code is as follows (example):
from bs4 import BeautifulSoup
import re
import urllib.request, urllib.error
import sqlite3
import os
from urllib import parse

kw = input("Enter the job title to search for: ")
# 51job expects the search keyword to be URL-encoded twice.
keyword = parse.quote(parse.quote(kw))
def main():
    print("Starting crawl...")
    # Base search URL for 51job; the page number and ".html" are appended in getData().
    baseurl = "https://search.51job.com/list/090200,000000,0000,00,9,99," + keyword + ",2,"
    datalist = getData(baseurl)
    db_savepath = "51job.db"
    saveData_db(datalist, db_savepath)
    print("Crawl finished!")
# Match one complete job record inside the embedded JSON of the search page
findjob = re.compile(r'"is_special_job":"",(.*?),"adid":""')
# Job detail link
findjob_L = re.compile(r'"job_href":"(.*?)",')
# Job title
findjob_n = re.compile(r'"job_name":"(.*?)",')
# Company detail link
findcompany_L = re.compile(r'"company_href":"(.*?)",')
# Company name
findcompany_n = re.compile(r'"company_name":"(.*?)",')
# Offered salary
findsalary = re.compile(r'"providesalary_text":"(.*?)",')
# Work location
findarea = re.compile(r'"workarea_text":"(.*?)",')
# Company type
findcompany_t = re.compile(r'"companytype_text":"(.*?)",')
# Job benefits
findjob_w = re.compile(r'"jobwelf":"(.*?)",')
# Company size
findcompany_s = re.compile(r'"companysize_text":"(.*?)",')
# Company industry
findcompany_i = re.compile(r'"companyind_text":"(.*?)"')
# Other attributes (experience, education, etc.)
findjob_a = re.compile(r'"attribute_text":(.*?)]')
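# Illustration (hypothetical fragment, not actual 51job data): the search page embeds
# its result list as escaped JSON inside a <script> tag, roughly of the form
#   ..."job_href":"https:\/\/jobs.51job.com\/xxx.html","job_name":"Python Developer",...
# Applied to such a fragment, findjob_n would capture "Python Developer", and the
# replace("\\", "") calls in getData() below strip the escaping backslashes from the links.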
# Crawl the search result pages
def getData(baseurl):
    datalist = []
    # 51job paginates search results; page numbers start at 1.
    for i in range(1, 11):
        url = baseurl + str(i) + ".html"
        html = askURL(url)
        print("Crawling page %d..." % i)
        # Parse the page
        soup = BeautifulSoup(html, "html.parser")
        ps = soup.find_all("script", type="text/javascript")
        # The embedded JSON with the job list sits in the third <script> tag.
        ps_l = str(ps[2])
        for j_list in re.findall(findjob, ps_l):
            data = []
            job_href = re.findall(findjob_L, j_list)[0]
            job_href = job_href.replace("\\", "")
            data.append(job_href)
            job_name = re.findall(findjob_n, j_list)[0]
            job_name = job_name.replace("\\", "")
            data.append(job_name)
            company_href = re.findall(findcompany_L, j_list)[0]
            company_href = company_href.replace("\\", "")
            data.append(company_href)
            company_name = re.findall(findcompany_n, j_list)[0]
            company_name = company_name.replace("\\", "")
            data.append(company_name)
            providesalary = re.findall(findsalary, j_list)[0]
            providesalary = providesalary.replace("\\", "")
            data.append(providesalary)
            workarea = re.findall(findarea, j_list)[0]
            workarea = workarea.replace("\\", "")
            data.append(workarea)
            companytype = re.findall(findcompany_t, j_list)[0]
            data.append(companytype)
            jobwelf = re.findall(findjob_w, j_list)[0]
            data.append(jobwelf)
            companysize_text = re.findall(findcompany_s, j_list)[0]
            data.append(companysize_text)
            companyind_text = re.findall(findcompany_i, j_list)[0]
            companyind_text = companyind_text.replace("\\", "")
            data.append(companyind_text)
            attribute_text = re.findall(findjob_a, j_list)[0]
            attribute_text = attribute_text.replace('"', "") + "]"
            data.append(attribute_text)
            datalist.append(data)
    print("All pages crawled!")
    return datalist
# Fetch the raw HTML of the given URL
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36 Edg/96.0.1054.53",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
#"Cookie": '''guid=129f518e4b1d964be3ec59fd44319ee3; _ujz=MTg0NDQxOTY5MA%3D%3D; nsearch=jobarea%3D%26%7C%26ord_field%3D%26%7C%26recentSearch0%3D%26%7C%26recentSearch1%3D%26%7C%26recentSearch2%3D%26%7C%26recentSearch3%3D%26%7C%26recentSearch4%3D%26%7C%26collapse_expansion%3D; 51job=cenglish%3D0%26%7C%26; search=jobarea%7E%60150300%7C%21ord_field%7E%600%7C%21recentSearch0%7E%60150300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%CE%DF%BA%FE%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21recentSearch1%7E%60150300%A1%FB%A1%FA000000%A1%FB%A1%FA0000%A1%FB%A1%FA00%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA99%A1%FB%A1%FA9%A1%FB%A1%FA99%A1%FB%A1%FA%A1%FB%A1%FA0%A1%FB%A1%FA%A1%FB%A1%FA2%A1%FB%A1%FA1%7C%21'''
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        # 51job serves its pages in GBK encoding.
        html = response.read().decode('gbk')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
# Create the SQLite database and the job table
def init_db(dbpath):
    if os.path.exists(dbpath):
        print("Database file already exists!")
    else:
        sql = '''
            create table if not exists job(
                id integer primary key autoincrement,
                job_href text,
                job_name text,
                company_href text,
                company_name text,
                providesalary text,
                workarea text,
                companytype text,
                jobwelf text,
                companysize_text text,
                companyind_text text,
                attribute_text text
            )
        '''
        conn = sqlite3.connect(dbpath)
        cursor = conn.cursor()
        cursor.execute(sql)
        conn.commit()
        conn.close()
# Save the crawled records to the SQLite database
def saveData_db(datalist, savepath):
    init_db(savepath)
    conn = sqlite3.connect(savepath)
    cur = conn.cursor()
    sql = '''
        insert into job(
            job_href, job_name, company_href, company_name, providesalary, workarea,
            companytype, jobwelf, companysize_text, companyind_text, attribute_text)
        values(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'''
    for data in datalist:
        # A parameterized insert needs no manual quoting and is not broken by quotes in the data.
        cur.execute(sql, data)
    conn.commit()
    cur.close()
    conn.close()
    print("Crawled data has been saved to the database file:", savepath)


if __name__ == '__main__':
    main()
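After a run, the stored records can be checked directly in 51job.db. The snippet below is an optional verification sketch, separate from the crawler above; it only assumes the job table created by init_db(), and the commented-out Excel export at the end would additionally require pandas and openpyxl, which the crawler itself does not use.
import sqlite3

conn = sqlite3.connect("51job.db")
cur = conn.cursor()
cur.execute("select count(*) from job")
print("rows stored:", cur.fetchone()[0])
# Print a few sample rows to eyeball the data.
for row in cur.execute("select job_name, company_name, providesalary, workarea from job limit 5"):
    print(row)
conn.close()

# Optional Excel export (extra dependencies: pandas and openpyxl):
# import pandas as pd
# df = pd.read_sql_query("select * from job", sqlite3.connect("51job.db"))
# df.to_excel("51job.xlsx", index=False)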
Summary
Running the crawler requires setting up the Python environment first (BeautifulSoup is the only third-party package the script imports); for details on the individual techniques, consult a Python tutorial.