Partial source code:

import copy
import json
import re

import scrapy
from scrapy.http import Request

from zhaopinspider.items import PositionItem


class FiveonejobSpider(scrapy.Spider):
    name = '51jobSpider'
    allowed_domains = ['search.51job.com', 'jobs.51job.com']
    keyword = ""
    start_urls = []
    # total_page = 1
    # City codes (full list: https://dataopen.liepin.com/basic/p/v2/getAllDq.json)
    city_list = {"北京": "010000", "天津": "050000"}
def __init__(self, keyword=None, city=None, page=None, *args, **kwargs):
self.baseurl = "https://search.51job.com"
super(FiveonejobSpider, self).__init__(*args, **kwargs)
self.keyword = keyword
self.city = city
self.city_code = self.city_list.get(city)
self.total_page = page
# self.num = num
    def start_requests(self):
        # Debug shortcut: fetch a single detail page directly.
        # item = PositionItem()
        # url = "https://jobs.51job.com/beijing/134749389.html?s=sou_sou_soulb&t=0_0"
        # yield Request(url=url, callback=self.parse_detail, meta={'item': item})
        for page in range(1, int(self.total_page or 1) + 1):
            starturl = ('https://search.51job.com/list/' + self.city_code
                        + ',000000,0000,00,9,99,' + self.keyword + ',2,' + str(page)
                        + '.html?lang=c&postchannel=0000&workyear=99&cotype=99'
                          '&degreefrom=99&jobterm=99&companysize=99&ord_field=0'
                          '&dibiaoid=0&line=&welfare=')
            yield Request(url=starturl, callback=self.parse_list)
    def parse_list(self, response):
        # 51job embeds the search results as a JavaScript assignment
        # "__SEARCH_RESULT__ = {...}" inside a <script> tag; slice out the
        # JSON between the marker and the closing </script>.
        html = response.text
        start_index = html.find('__SEARCH_RESULT__ = ') + len('__SEARCH_RESULT__ = ')
        sub_html = html[start_index:]
        end_index = sub_html.find('</script>')
        data_str = sub_html[:end_index]
        data_json = json.loads(data_str)
for item_data in data_json['engine_jds']:
item = PositionItem()
item['site'] = "51job"
item['source_url'] = item_data['job_href']
item['city'] = self.city
item['keyword'] = self.keyword
item['name'] = item_data['job_name']
item['salary'] = item_data['providesalary_text'].strip()
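            # providesalary_text appears in several formats, e.g. "1.5-2万/月",
            # "2万以上/月", "6-8千/月", "20-30万/年"; each branch below
            # normalizes it to min/max monthly pay in CNY (unparsed formats
            # are skipped).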
if "万/月" in item['salary']:
salary = item['salary'].replace("万/月", "")
salary = salary.split("-")
item['min_salary'] = int(float(salary[0])*10000)
item['max_salary'] = int(float(salary[1])*10000)
elif "万以上/月" in item['salary']:
salary = item['salary'].replace("万以上/月", "")
item['min_salary'] = int(float(salary)*10000)
item['max_salary'] = 0
elif "千/月" in item['salary']:
salary = item['salary'].replace("千/月", "")
salary = salary.split("-")
item['min_salary'] = int(float(salary[0])*1000)
item['max_salary'] = int(float(salary[1])*1000)
elif "万/年" in item['salary']:
salary = item['salary'].replace("万/年", "")
salary = salary.split("-")
item['min_salary'] = int(float(salary[0])*10000/12)
item['max_salary'] = int(float(salary[1])*10000/12)
            else:
                # Unrecognized salary format (e.g. "面议"); skip this posting.
                self.logger.info("skip, unparsed salary: %s", item)
                continue
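            # attribute_text is assumed to hold [location, experience,
            # education, headcount] in that order, e.g. ["上海-浦东新区",
            # "3-4年经验", "本科", "招2人"], so index 1 is experience and
            # index 2 is education.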
item['edu'] = item_data['attribute_text'][2]
item['work_time'] = item_data['attribute_text'][1]
if "年经验" in item['work_time']:
work_time = item['work_time'].strip().replace("年经验", "").split("-")
item['min_work_time'] = int(work_time[0])
if len(work_time) > 1:
item['max_work_time'] = int(work_time[1])
            # NOTE: don't time.sleep() here; it blocks the Twisted reactor and
            # stalls every concurrent request. Throttle via DOWNLOAD_DELAY
            # instead (see the settings excerpt below).
            yield Request(url=item['source_url'], callback=self.parse_detail, meta={'item': item})
    # Parse the job-detail page and extract the required skills.
    def parse_detail(self, response):
        item = copy.deepcopy(response.meta['item'])
        details = response.xpath('//div[@class="bmsg job_msg inbox"]/text()').extract()
        need_skill = "".join(details)
        item['need_skill'] = self.annaysisSkill(need_skill)
        yield item
    def annaysisSkill(self, data_str):
        # Pull the English tokens (skill names such as Python, MySQL) out of
        # the description and join them with '\', e.g.
        # "熟悉Python和MySQL" -> "Python\MySQL".
        line = re.findall(r"[a-zA-Z]+", data_str)
        line_str = "\\".join(line)
        return line_str
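PositionItem itself is not included in this excerpt. Judging from the fields the spider fills in, a minimal sketch of zhaopinspider/items.py might look like this (the field set is inferred from the code above, not taken from the original file):

import scrapy


class PositionItem(scrapy.Item):
    site = scrapy.Field()           # source site, e.g. "51job"
    source_url = scrapy.Field()     # detail-page URL
    city = scrapy.Field()
    keyword = scrapy.Field()
    name = scrapy.Field()           # job title
    salary = scrapy.Field()         # raw salary text
    min_salary = scrapy.Field()     # normalized, CNY per month
    max_salary = scrapy.Field()
    edu = scrapy.Field()
    work_time = scrapy.Field()      # raw experience text
    min_work_time = scrapy.Field()  # years
    max_work_time = scrapy.Field()
    need_skill = scrapy.Field()     # extracted English skill tokens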
Screenshots (omitted here): login UI, data scraping, data visualization, project file structure.
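As noted in parse_list, request throttling belongs in the project settings rather than in a blocking sleep. A minimal excerpt of zhaopinspider/settings.py, with illustrative values (the original settings file is not shown):

# zhaopinspider/settings.py (excerpt; values are illustrative)
DOWNLOAD_DELAY = 3               # wait ~3 s between requests, like the original sleep
RANDOMIZE_DOWNLOAD_DELAY = True  # add jitter so requests look less mechanical
CONCURRENT_REQUESTS_PER_DOMAIN = 1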
Full source code: https://pan.baidu.com/s/160gfcbdLhp9F5_hS4tewsg?pwd=mdaj (extraction code: mdaj)
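Assuming the standard Scrapy project layout, the spider takes its keyword, city, and page count as -a arguments (which map onto the __init__ parameters above), for example:

scrapy crawl 51jobSpider -a keyword=python -a city=北京 -a page=5 -o positions.json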