
[Python知识库] Fetching job postings with Python

Complete 51job scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# Fetch job listing info
def jobMesssage(item):
    df = pd.DataFrame()
    item.list = item.find_all('a', attrs={'class': 'el'})  # job posting entries
    for i, item in enumerate(item.list):
        try:
            # trailing commas wrap each value in a 1-tuple, so pandas builds a one-row column
            df['招聘职位网址'] = item.get('href'),
            df['岗位名称'] = item.find_all('span')[0].text,
            df['发布日期'] = item.find_all('span')[1].text,
            df['薪资'] = item.find_all('span')[2].text,
            df['工作地及要求'] = item.find_all('span')[3].text,
            for i, tag in enumerate(item.find_all('p', attrs={'class': 'tags'})):
                df['福利'] = tag.get('title'),  # benefits
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')
    return df


# Fetch the company info attached to each posting
def jobFirm(item):
    df = pd.DataFrame()
    item.list = item.find_all('div', attrs={'class': 'er'})  # company info blocks
    for i, item in enumerate(item.list):
        try:
            df['招聘公司网址'] = item.find('a').get('href'),
            df['公司名称'] = item.find('a').text,
            df['公司规模'] = item.find_all('p')[0].text,
            df['所属行业'] = item.find_all('p')[1].text,  # industry is the second <p>, not a repeat of the first
            print(str(i), '招聘公司写入正常')
        except:
            print(str(i), '招聘公司写入异常')
    return df


# Job requirements
def jobRequire(html):
    df = pd.DataFrame()
    html.list = html.find_all('div', attrs={'class': 'tCompany_main'})
    for i, item in enumerate(html.list):
        try:
            # full requirements text with whitespace collapsed
            jobRequir_a = item.find('div', attrs={'class': 'tBorderTop_box'}).text.strip().replace('\n', '').replace(
                '\t', '').replace(' ', '')
            item.list = item.find('div', attrs={'class': 'tBorderTop_box'}).find_all('p')
            jobRequir = []  # requirement lines
            for i, item in enumerate(item.list):
                jobRequir.append(item.text.strip() + '\n')
                jobRequirText = ''.join(jobRequir)
                # keep the line-by-line version only when it contains a '任职要求' section
                if jobRequirText.find('任职要求') > 0:
                    df['招聘要求'] = jobRequirText,
                else:
                    df['招聘要求'] = jobRequir_a,
            print(str(i), '职位信息写入正常')
        except:
            print(str(i), '职位信息写入异常')
    return df


# Fetch the full company description
def firmMeessage(html):
    df = pd.DataFrame()
    html.list = html.find_all('div', attrs={'class': 'tCompany_full'})
    for i, item in enumerate(html.list):
        item.list = item.find_all('div', attrs={'class': 'tBorderTop_box'})
        try:
            df['公司信息'] = item.list[0].text.strip().replace('\n', '').replace('\t', '').replace(' ', ''),
            print(str(i), '公司信息写入正常')
        except:
            print(str(i), '公司信息写入异常')

    return df


class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        new_worksheet.range('l:l').row_height = 20
        new_worksheet.range('l:l').column_width = 11
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期', '是否投递']
        new_worksheet['A1'].value = title
        for i in range(len(self.data)):
            new_worksheet.cells[i + 1, 0].value = i + 1
            new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
            new_worksheet.cells[i + 1, 2].value = self.data[i]['发布日期']
            new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
            new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
            new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
            new_worksheet.cells[i + 1, 6].value = self.data[i]['公司规模']
            new_worksheet.cells[i + 1, 7].value = self.data[i]['所属行业']
            new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
            # new_worksheet.cells[i + 1, 9].value = self.data[i]['招聘要求']
            new_worksheet.cells[i + 1, 10].value = self.data[i]['招聘公司网址']
            # new_worksheet.cells[i + 1, 11].value = self.data[i]['公司信息']
            new_worksheet.cells[i + 1, 12].value = self.data[i]['福利']
            # fields filled from the module-level settings
            new_worksheet.cells[i + 1, 13].value = key  # keyword
            new_worksheet.cells[i + 1, 14].value = '15-40K' if salary == '08%252c09%252c10' else '20-30K'  # salary range
            new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

            print(str(i), 'Excel数据写入正常')
        new_worksheet.autofit()
        new_workbook.save('jobGain.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        pf = multiprocessing.Process(target=self.wE_r)  # pass the method itself, not its result
        pf.start()
        pf.join()


# Write to Excel separately (fill requirements/company info into existing rows)
def write_only():
    app = xw.App(visible=True, add_book=False)
    wb = app.books.open('职业发展-only.xlsx')
    sh = wb.sheets['前程无忧']
    rng_firmMeessage = [i for i in sh.range("k:k").value if i is not None]  # company URLs from column K
    rng_jobRequire = [i for i in sh.range("i:i").value if i is not None]  # job URLs from column I
    j = sh.range('a1').expand('table').rows.count  # row count
    app.display_alerts = False
    myWeb = Web(job_url)  # instantiate the Web helper
    for i in range(len(rng_jobRequire) - 1):
        try:
            html = myWeb.web_a(rng_jobRequire[i + 1])  # fetch the requirements page
            print(rng_jobRequire[i + 1])
            df4 = jobRequire(html)
            print(df4)
            sh.cells[i + 1, 9].value = df4.iloc[0, 0]
            print(str(i), '招聘要求写入正常')

            html = myWeb.web_b(rng_firmMeessage[i + 1])  # fetch the company page
            print(rng_firmMeessage[i + 1])
            df5 = firmMeessage(html)
            print(df5)
            sh.cells[i + 1, 11].value = df5.iloc[0, 0]
            print(str(i), '公司信息写入正常')

        except:
            print(str(i), "数据查询错误")

    wb.save('职业发展-only.xlsx')
    wb.close()
    app.quit()

class Web:
    def __init__(self, url):
        self.url = url

    def web(self):
        driver.back()
        time.sleep(0.3)
        driver.get(self.url)  # load the search-results page
        time.sleep(1 + 1)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')  # parse page content
        html.list = html.find_all('div', attrs={'class': 'j_joblist'})
        return html.list

    # fetch the job-requirements page
    def web_a(self, url):
        job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
        driver.back()
        driver.get(job_url)  # bounce through an unrelated page first to reset state
        time.sleep(0.3 + 0.4)
        driver.get(url)  # load the target page
        time.sleep(1.2 + 1)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html

    # fetch the company-info page
    def web_b(self, url):
        job_url = 'https://www.baidu.com/?tn=21002492_18_hao_pg'
        driver.back()
        driver.get(job_url)
        time.sleep(0.5 + 0.5)
        driver.get(url)  # load the target page
        time.sleep(1.2 + 1)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        return html


key = '配送'  # keyword; alternatives: 物流经理 / 物流运营 / 物流管理 / 运营 / 物流 / 数据 / 运输 / 仓储 / 配送
salary = '08%252c09%252c10'  # 08 = 15-20K, 09 = 20-30K; '08%252c09%252c10' = 15-20K, 20-30K and 30K+
timeday = '3'  # 1 = last 3 days, 2 = last week, 3 = last month
if __name__ == "__main__":

    opt = FirefoxOptions()  # or ChromeOptions()
    # do not load images
    opt.set_preference('permissions.default.image', 2)
    opt.headless = False  # show the browser window
    driver = webdriver.Firefox(options=opt)  # or Chrome(options=opt)
    # driver.set_window_size(500, 900)

    job_url = 'https://search.51job.com/list/080200,000000,0000,00,9,99,%25E7%2589%25A9%25E6%25B5%2581,2,1.html?'
    # Hangzhou, 20-30K: 'https://search.51job.com/list/080200,000000,0000,00,9,09,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
    # Hangzhou, 15-20K: 'https://search.51job.com/list/080200,000000,0000,00,9,08,%25E7%2589%25A9%25E6%25B5%2581%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='

    # # Fetch requirements for a single posting (example):
    # myWeb = Web(job_url)
    # time.sleep(0.2)
    # html = myWeb.web_a('https://jobs.51job.com/hangzhou-scq/125683481.html?s=sou_sou_soulb&t=0_0')
    # df4 = jobRequire(html)  # extract the job requirements
    # print(df4)
    # time.sleep(0.3)
    '''
    # 取前三页数据
    df = pd.DataFrame()  # 定义pands整理表格
    for i in range(6):
        try:  # '+str(i+1)+'#08表示1.5-20K,09表示20-30k
            print(str(i), '获取第{}页数据'.format(i + 1))
            job_url = 'https://search.51job.com/list/080200,000000,0000,21,' + timeday + ',' + salary + ',' + key + ',2,' + str(
                i + 1) + '.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&&jobterm=99&companysize=99&ord_field=1&dibiaoid=0&line=&welfare='#03%252c04&大专和本科学历
            print(job_url)
            'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=03%252c04&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加学历degreefrom
            'https://search.51job.com/list/080200,000000,0000,21,3,08%252c09%252c10,%25E8%25BF%2590%25E8%2590%25A5,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='#增加薪资
            'https://search.51job.com/list/080200,000000,0000,21,3,09,%25E8%25BF%2590%25E8%2590%25A5,2,2.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
            # 'https://search.51job.com/list/080200,000000,0000,00,1,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
            # 'https://search.51job.com/list/080200,000000,0000,00,1,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
            # 'https://search.51job.com/list/080200,000000,0000,00,3,09,%25E7%2589%25A9%25E6%25B5%2581%25E7%25BB%258F%25E7%2590%2586,2,1.html?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare='
            # with open('jobhtml.html', 'r', encoding='utf-8') as f:
            #     html = BeautifulSoup(f, 'html.parser')
            #     html.list = html.find_all('div', attrs={'class': 'j_joblist'})
            time1 = time.time()  # 计算时长
            myWeb = Web(
                job_url)  # 实例化类  # 'https://jobs.51job.com/hangzhou-yhq/135494019.html?s=sou_sou_soulb&t=0_0')  # 实例化网址
            time.sleep(1 + 1)
            html = myWeb.web()
            # print(html)
            for i, item in enumerate(html):
                # print(item,i,sep=',')
                item.list = item.find_all('div', attrs={'class': 'e'})  # 获取每个招聘岗位条目
                for i, item in enumerate(item.list):
                    df1 = jobMesssage(item)  # 获取岗位
                    # print(df1['招聘职位网址'])
                    df2 = jobFirm(item)  # 获取公司
                    url = str(df1['招聘职位网址'].values).strip("['").strip("']").strip('')
                    print(url)
                    url_b = str(df2['招聘公司网址'].values).strip("['").strip("']").strip('')
                    print(url_b)
                    # optional: fetch requirement and company detail pages per posting (slow)
                    # 招聘需求信息获取
                    myWeb = Web(job_url)  # 实例化类
                    time.sleep(0.3)
                    html = myWeb.web_a(
                        url)  # 'https://jobs.51job.com/hangzhou/135496109.html?s=sou_sou_soulb&t=0_0') # 实例化网址
                    df4 = jobRequire(html)  # 获取职位需求信息
                    print(df4)
                    time.sleep(0.5 + 0.5 + 0.5)

                    # 招聘公司信息获取
                    myWeb = Web(job_url)  # 实例化类
                    time.sleep(0.3)
                    html = myWeb.web_b(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
                    df5 = firmMeessage(html)  # 获取职位需求信息
                    print(df5)
                    time.sleep(0.5 + 0.5 + 0.5)
                    
                    df3 = pd.concat([df1, df2], axis=1)
                    df6 = pd.concat([df3, df4], axis=1)
                    df7 = pd.concat([df5, df6], axis=1)
                    df7.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
                    df = pd.concat([df, df7], axis=0)
                    # end of optional block
                    df3 = pd.concat([df1, df2], axis=1)
                    df = pd.concat([df, df3], axis=0)
                    print(df)
                    df.to_json('jobGain.json', orient='records', indent=1, force_ascii=False)
                    time.sleep(0.5 + 0.5 + 0.5)
            time.sleep(0.5 + 0.5 + 0.5)
            print(str(i), '数据正常'.format(i + 1))

            time2 = time.time()  # 计算时长
            print('总耗时:{}'.format(time2 - time1))
        except:
            print(str(i), '数据异常'.format(i + 1))
    '''
    '''
    # Load the saved JSON and write it to Excel:
    with open('jobGain.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        myWe = writeExcel(data)  # write to Excel
        myWe.run()  # run in a separate process
    '''
    # write to Excel separately
    write_only()

    try:  # shut down the browser
        all_handles = driver.window_handles  # handles of every open window
        for handle in all_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM geckodriver.exe')  # kill the Firefox driver process
    except:
        print('后台浏览器已关闭')
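A note on the 51job URLs above: the keyword segment is URL-encoded twice, which is why it reads %25E7%2589%25A9%25E6%25B5%2581 rather than %E7%89%A9%E6%B5%81 (the single encoding of 物流). A minimal sketch of how that form can be produced with the standard library, assuming the double encoding is what the site expects:

from urllib.parse import quote

key = '物流'
once = quote(key)    # '%E7%89%A9%E6%B5%81'
twice = quote(once)  # '%25E7%2589%25A9%25E6%25B5%2581', the form seen in job_url above
print(once, twice)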







Liepin (猎聘网) scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# Fetch job listing info
def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-detail-box'})
        for i, item in enumerate(item.list):  # each job entry
            try:
                df_jobMesssage['招聘职位网址'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).get('href'),
                df_jobMesssage['岗位名称'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).find('div', attrs={
                    'class': 'job-title-box'}).text.strip('').replace('\n', '').replace('\t', ''),
                df_jobMesssage['工作地及要求'] = item.find('a', attrs={'data-nick': 'job-detail-job-info'}).find('div',
                                                                                                           attrs={
                                                                                                               'class': 'job-labels-box'}).text.strip(
                    '').replace('\n', '').replace('\t', ''),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'data-nick': 'job-detail-company-info'}).find('div',
                                                                                                               attrs={
                                                                                                                   'class': 'job-company-info-box'}).text.strip(
                    '').replace('\n', '').replace('\t', '')
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-detail-header-box'}).find('span', attrs={
                    'class': 'job-salary'}).text

                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)

                print(str(i), '招聘职位写入正常')
            except:
                print(str(i), '招聘职位写入异常')

    return df


# Fetch job requirements and company info via requests
def jobRequire(url):
    df = {}  # plain dict
    cookie = '__uuid=1632571874000.95; __s_bid=11011704223d5f9c92ff5bd3e81bc8334a74; __tlog=1632611231431.79%7C00000000%7C00000000%7C00000000%7C00000000; Hm_lvt_a2647413544f5a04f00da7eee0d5e200=1632571900,1632611231; Hm_lpvt_a2647413544f5a04f00da7eee0d5e200=1632615070; __session_seq=12; __uv_seq=12'
    headers = {
        # header values must not repeat the header name
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36',
        'Cookie': cookie,
        'Connection': 'keep-alive'
    }
    res = requests.get(url=url, headers=headers, timeout=30)
    res.raise_for_status()
    res.encoding = res.apparent_encoding
    html = BeautifulSoup(res.text, 'html.parser')
    time.sleep(0.1)
    html.list = html.find_all('content')  # outer frame
    for i, item in enumerate(html.list):
        try:
            df['招聘要求'] = item.find_all('section', attrs={'class': 'job-intro-container'})[0].text.strip('\n'),
            df['公司信息'] = item.find_all('section', attrs={'class': 'company-intro-container'})[0].text.strip('\n'),
            print(df)
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')

    return df


class Web:
    def __init__(self, url):
        self.url = url

    # fetch the job-listings page
    def web(self):
        driver.back()
        time.sleep(0.3)
        driver.get(self.url)  # load the search-results page
        time.sleep(1)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')  # parse page content
        html.list = html.find_all('div', attrs={'class': 'left-list-box'})
        return html.list

    # fetch job requirements and company info
    def web_a(self, url):
        driver.back()
        time.sleep(0.3)
        driver.get(url)  # load the detail page
        time.sleep(1)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        html.list = html.find_all('content')  # outer frame
        return html.list


class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title

        for i in range(len(self.data)):
            try:
                df_w = jobRequire(self.data[i]['招聘职位网址'])
                print(self.data[i]['招聘职位网址'])
                # if i % 9 == 8:
                #     time.sleep(20)  # pause every few items to dodge anti-scraping
                # else:
                #     time.sleep(0.2)

                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # 发布日期 (not captured on Liepin)
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # 公司规模 (not captured)
                new_worksheet.cells[i + 1, 7].value = ''  # 所属行业 (not captured)
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 10].value = ''  # 招聘公司网址 (not captured)
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                new_worksheet.cells[i + 1, 12].value = ''  # 福利 (not captured)
                # fields filled from the module-level settings
                new_worksheet.cells[i + 1, 13].value = key  # keyword
                new_worksheet.cells[i + 1, 14].value = salary  # salary range
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date
                print(str(i), 'Excel数据写入正常')
            except:
                print(str(i), 'Excel数据写入异常')

        # new_worksheet.autofit()
        new_workbook.save('jobliepin.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        pf = multiprocessing.Process(target=self.wE_r)  # pass the method itself, not its result
        pf.start()
        pf.join()


df = pd.DataFrame()  # module-level DataFrame
key = '配送'  # keyword; alternatives: 物流经理 / 物流运营 / 运营 / 物流 / 数据 / 运输 / 仓储 / 配送
salary = '15$30'  # other options: 20$40 / 10$20
timeday = '7'  # 7 = within a week, 3 = within 3 days, 30 = within a month
if __name__ == "__main__":
    opt = ChromeOptions()  # Chrome options
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)  # instantiate the browser
    # driver.set_window_size(300, 700)
    for i in range(6):
        try:
            print(str(i), '获取第{}页数据'.format(i + 1))

            job_url = 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key=' + str(
                key) + '&industry=9$250&dq=070020&salary=' + salary + '&pubTime=' + timeday + '&currentPage=' + str(i)
            print(job_url)
            # reference URL shapes:
            # 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&ckId=l07j4hoqgyh0gdr1cskm5ur0c8umz86a&oldCkId=84c318d34f244edd65090ab5353419c3&fkId=myu97a638ugqeosooma1w35gexn1tw78&skId=44ef33b0864b17ba4a80114662f6d01a&sfrom=search_job_pc&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&industry=9$250&dq=070020&salary=15$30&pubTime=7&customSalary=1&currentPage=1&scene=page'
            # 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&ckId=6aa2zbc9ptmwb1w7909zc2vm047p6uib&fkId=myu97a638ugqeosooma1w35gexn1tw78&skId=44ef33b0864b17ba4a80114662f6d01a&sfrom=search_job_pc&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&industry=&dq=070020&salary=15$30&pubTime=7&customSalary=1&scene=condition'
            # 'https://www.liepin.com/zhaopin/?headId=12baac27653545ffceb6a268fc0c82aa&ckId=12baac27653545ffceb6a268fc0c82aa&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&dq=070020&salary=20$40&pubTime=3&currentPage=1'
            # 'https://www.liepin.com/zhaopin/?headId=12baac27653545ffceb6a268fc0c82aa&key=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&dq=070020&salary=10$20&pubTime=3'
            # 'https://www.liepin.com/zhaopin/?headId=9f577a23fdb5d9437efff7679944c610&key=%E7%89%A9%E6%B5%81%E7%AE%A1%E7%90%86&dq=070020&salary=20$40&pubTime=3'
            time1 = time.time()  # timing start

            # fetch job listings
            myWeb = Web(job_url)
            html = myWeb.web()
            time.sleep(0.5)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobliepin.json', orient='records', indent=1, force_ascii=False)

            time2 = time.time()  # timing end
            print(str(i), '第{}页数据正常'.format(i + 1))
            print('总耗时:{}'.format(time2 - time1))
        except:
            print(str(i), '第{}页数据异常'.format(i + 1))

    # load the saved JSON and write it to Excel
    with open('jobliepin.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        myWe = writeExcel(data)  # write to Excel
        myWe.run()  # run in a separate process

    try:  # shut down the browser
        all_handles = driver.window_handles  # handles of every open window
        for handle in all_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill the driver process
    except:
        print('后台浏览器已关闭')
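Liepin encodes its salary filter as 'low$high' in thousands of RMB per month, e.g. salary = '15$30' for 15-30K (see the comment at the top of the script). If you want a readable label in the 薪资范围 column instead of the raw code, a small helper like the hypothetical salary_label below would do; the function name and placement are my own, not part of the original script:

def salary_label(code):
    # turn a Liepin salary code such as '15$30' into a '15-30K' label
    low, high = code.split('$')
    return '{}-{}K'.format(low, high)

print(salary_label('15$30'))  # 15-30K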



BOSS直聘 (zhipin.com) scraping code:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.firefox.options import Options as FirefoxOptions
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# Fetch job listing info
def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-primary'})
        for i, item in enumerate(item.list):  # each job entry
            try:
                item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
                    '\n', ' ')
                print(item.list, i, sep=',')

                df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
                                                                                attrs={'class': 'primary-box'}).get(
                    'href'),
                df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-name'}).text,
                df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-area-wrapper'}).text.strip('\n'),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
                                                                                                        '').replace(
                    '\n', ' '),
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
                    '\n', ' '),
                df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
                                                                                                              '').replace(
                    '\n', ' '),
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                print(str(i), '招聘职位写入正常')
            except:
                print(str(i), '招聘职位写入异常')
    return df


# Fetch job requirements and company info
def jobRequire(html):
    df = {}  # plain dict
    # an earlier requests-based version with hand-copied cookies is omitted here;
    # BOSS直聘 detail pages are fetched through Selenium instead
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip().replace(' ', '')
        print(item.list, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
                '\n', ' ').replace('\r', ' ').replace('\t', ' '),  # 上级框架,
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
                                                                                                                   ''),
            # df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # print(df)
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')

    return df
class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期','是否投递']
        new_worksheet['A1'].value = title
        new_worksheet.range('l:l').row_height = 20
        new_worksheet.range('l:l').column_width = 11

        for i in range(len(self.data)):
            try:
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # 发布日期 (not captured on BOSS直聘)
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # 公司规模 (not captured)
                new_worksheet.cells[i + 1, 7].value = ''  # 所属行业 (not captured)
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                new_worksheet.cells[i + 1, 10].value = ''  # 招聘公司网址 (not captured)
                new_worksheet.cells[i + 1, 12].value = ''  # self.data[i]['福利']
                # fields filled from the module-level settings
                new_worksheet.cells[i + 1, 13].value = key  # keyword
                new_worksheet.cells[i + 1, 14].value = '20-30K' if salary == '6' else '15-20K'  # salary range
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

                print(str(i), 'Excel数据写入正常')
            except:
                print(str(i), 'Excel数据写入异常')
        # second pass: fetch requirements and company info per posting
        for i in range(len(self.data)):
            try:
                time1 = time.time()  # timing start
                myWeb = Web(url)
                time.sleep(0.5 + 1)
                html = myWeb.web_a(self.data[i]['招聘职位网址'], i)  # load the detail page
                df_w = jobRequire(html)  # extract requirements and company info
                print(df_w)
                time.sleep(2)
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'Excel数据-2模块写入正常')
                time2 = time.time()  # timing end
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), 'Excel数据-2模块写入异常')

        # new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        pf = multiprocessing.Process(target=self.wE_r)  # pass the method itself, not its result
        pf.start()
        pf.join()

# Write to Excel separately (fill requirements/company info into existing rows)
def write_only():
    app = xw.App(visible=True, add_book=False)
    wb = app.books.open('职业发展-only.xlsx')
    sh = wb.sheets['Boss']
    rng_jobRequire = [i for i in sh.range("i:i").value if i is not None]  # job URLs from column I
    j = sh.range('a1').expand('table').rows.count  # row count
    app.display_alerts = False
    myWeb = Web(url)  # instantiate the Web helper
    for i in range(len(rng_jobRequire) - 1):
        try:
            html = myWeb.web_a(rng_jobRequire[i + 1],i) # 获取招聘要求信息
            print(rng_jobRequire[i + 1])
            df_w = jobRequire(html)
            print(df_w)
            # print(df4.index)
            # print(df4.iloc[0,0])
            sh.cells[i + 1, 9].value = df_w['招聘要求']
            sh.cells[i + 1, 11].value = df_w['公司信息']
            # print(str(i), '招聘要求写入正常')

        except:
            print(str(i), "数据查询错误")

    # sh.autofit()
    wb.save('职业发展-only.xlsx')
    wb.close()
    app.quit()
class Web:

    def __init__(self, url):
        self.url = url

    # fetch the job-listings page
    def web(self):
        driver.get(self.url)  # load the search-results page
        time.sleep(1.5 + 0.5 + 0.5 + 0.5)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')  # parse page content
        html.list = html.find_all('div', attrs={'class': 'job-list'})
        return html.list

    # fetch job requirements and company info
    def web_a(self, url, i):
        driver.get('https://www.baidu.com/')  # bounce through an unrelated page to reset state
        driver.refresh()
        driver.get('https://www.baidu.com/')
        driver.back()
        time.sleep(0.5 + 1)
        driver.get(url)  # load the detail page
        time.sleep(1)
        # pause every 8 requests to dodge anti-scraping
        if i % 8 == 7:
            time.sleep(13)
            print('刷新{}次'.format(i))

        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')
        html.list = html.find_all('div', attrs={'class': 'job-detail'})  # outer frame

        print('当前运行条目数', i)
        return html.list
df = pd.DataFrame()  # module-level DataFrame
key = '数据'  # keyword; alternatives: 物流经理 / 物流运营 / 物流管理 / 运营 / 物流 / 数据 / 运输 / 仓储 / 配送
salary = '6'  # 5 = 15-20K, 6 = 20-30K

if __name__ == '__main__':
    # (single-page test code omitted; see write_only below)
    opt = ChromeOptions()  # Chrome options
    # to skip image loading:
    # prefs = {"profile.managed_default_content_settings.images": 2}
    # opt.add_experimental_option("prefs", prefs)
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)  # instantiate the browser
    # driver.set_window_size(300, 700)

    '''
    # Firefox alternative:
    opt = FirefoxOptions()
    # do not load images
    opt.set_preference('permissions.default.image', 2)
    opt.headless = False  # show the browser window
    driver = webdriver.Firefox(options=opt)
    '''

    url = 'https://www.zhipin.com/i100502-c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
    '''
    for i in range(6):  # +str(i);key=
        try:
            print(str(i), '获取第{}页数据'.format(i + 1))
            url = 'https://www.zhipin.com/i100502-c101210100/y_' + salary + '/?query=' + key + '&city=101210100&industry=&position=&ka=sel-salary-' + salary + '&page=' + str(
                i + 1) + '&ka=page-' + str(i + 1)
            print(url)
            # 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
            # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
            # 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
            # ‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
            time1 = time.time()  # 计算时长
            # 获取招聘职位信息
            myWeb = Web(url)
            html = myWeb.web()  # 获取招聘岗位信息
            # html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
            time.sleep(0.5+1+1+0.5+0.5+0.5+1)
            # print(html)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
            # print(url_b)
            # # 招聘公司信息获取
            # myWeb = Web(url)  # 实例化类
            # time.sleep(0.3)
            # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
            # df2 = jobRequire(html)  # 获取职位需求信息
            # print(df2)
            # time.sleep(0.5)
            #
            # df3 = pd.concat([df1, df2], axis=1)
            # df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df = pd.concat([df, df3], axis=0)
            # print(df)
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # time.sleep(0.5)
            time2 = time.time()  # 计算时长
            print(str(i), '数据正常'.format(i + 1))
            print('总耗时:{}'.format(time2 - time1))
        except:
            print(str(i), '数据异常'.format(i + 1))

    # 写入excel
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # 写入excel
        myWe.run()  # 执行多线程
    '''

    # write to Excel separately
    write_only()


    try:  # shut down the browser
        all_handles = driver.window_handles  # handles of every open window
        for handle in all_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill the driver process
    except:
        print('后台浏览器已关闭')
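All three scripts originally shared the same bug in writeExcel.run: multiprocessing.Process(target=self.wE_r()) calls wE_r immediately in the parent process and passes its return value (None) as the target, so no child process ever does the work. The corrected pattern passes the callable itself; a minimal self-contained sketch (worker is a stand-in name of my own):

import multiprocessing


def worker():
    print('writing the workbook in a child process')


if __name__ == '__main__':
    p = multiprocessing.Process(target=worker)  # pass the callable, no parentheses
    p.start()
    p.join()  # wait for the child to finish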

Complete BOSS直聘 code 2 - extracting job requirements and company info separately:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


# Fetch job listing info
def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-primary'})
        for i, item in enumerate(item.list):  # each job entry
            try:
                item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
                    '\n', ' ')
                print(item.list, i, sep=',')

                df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
                                                                                attrs={'class': 'primary-box'}).get(
                    'href'),
                df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-name'}).text,
                df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-area-wrapper'}).text.strip('\n'),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
                                                                                                        '').replace(
                    '\n', ' '),
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
                    '\n', ' '),
                df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
                                                                                                              '').replace(
                    '\n', ' '),
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                print(str(i), '招聘职位写入正常')
            except:
                print(str(i), '招聘职位写入异常')
    return df


# Fetch job requirements and company info
def jobRequire(html):
    df = {}  # plain dict
    # an earlier requests-based version with hand-copied cookies is omitted here;
    # BOSS直聘 detail pages are fetched through Selenium instead
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip().replace(' ', '')
        print(item.list, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
                '\n', ' ').replace('\r', ' ').replace('\t', ' '),  # 上级框架,
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ',
                                                                                                                   ''),
            print(str(i), '招聘职位写入正常')
        except:
            print(str(i), '招聘职位写入异常')

    return df


class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title

        for i in range(len(self.data)):
            try:
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # 发布日期 (not captured on BOSS直聘)
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # 公司规模 (not captured)
                new_worksheet.cells[i + 1, 7].value = ''  # 所属行业 (not captured)
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                new_worksheet.cells[i + 1, 10].value = ''  # 招聘公司网址 (not captured)
                new_worksheet.cells[i + 1, 12].value = ''  # 福利
                # fields filled from the module-level settings
                new_worksheet.cells[i + 1, 13].value = key  # keyword
                new_worksheet.cells[i + 1, 14].value = '20-30K' if salary == '6' else '15-20K'  # salary range (salary is a string)
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

                print(str(i), 'Excel数据写入正常')
            except:
                print(str(i), 'Excel数据写入异常')
        # (second-pass fetch below is superseded by wE_r_a)
        # for i in range(len(self.data)):
        #     try:
        #         # 招聘公司信息获取
        #         time1 = time.time()  # 计算时长
        #         myWeb = Web(url)  # 实例化类
        #         time.sleep(0.5)
        #         html = myWeb.web_a(data[i]['招聘职位网址'])  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
        #         df_w = jobRequire(html)  # 获取职位需求信息
        #         print(df_w)
        #         time.sleep(2.5)
        #         new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
        #         new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
        #         print(str(i), 'Excel数据-2模块写入正常')
        #         time2 = time.time()  # 计算时长
        #         print('总耗时:{}'.format(time2 - time1))
        #     except:
        #         print(str(i), 'Excel数据-2模块写入异常')

        new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()
    def wE_r_a(self):
        app = xw.App(visible=True, add_book=False)
        wb = app.books.open('jobBoss.xlsx')
        sh = wb.sheets['worksheet']
        rng = [i for i in sh.range("i:i").value if i is not None]  # job URLs from column I
        print(rng)
        app.display_alerts = False
        app.screen_updating = False
        for i in range(len(rng) - 1):
            try:
                # fetch requirements and company info for this row
                time1 = time.time()  # timing start
                myWeb = Web(url)
                time.sleep(0.5)
                html = myWeb.web_a(rng[i + 1])  # load the detail page
                df_w = jobRequire(html)  # extract requirements and company info
                print(df_w)
                time.sleep(2.5)
                sh.cells[i + 1, 9].value = df_w['招聘要求']
                sh.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'Excel数据-2模块写入正常')
                time2 = time.time()  # 计算时长
                print('总耗时:{}'.format(time2 - time1))
            except:
                print(str(i), 'Excel数据-2模块写入异常')

        sh.autofit()
        wb.save('jobBoss.xlsx')
        wb.close()
        app.quit()

    def run(self):
        pf = multiprocessing.Process(target=self.wE_r)  # pass the method itself, not its result
        pf.start()
        pf.join()

    def run_a(self):
        pf = multiprocessing.Process(target=self.wE_r_a)
        pf.start()
        pf.join()


class Web:
    def __init__(self, url):
        self.url = url

    # fetch the job-listings page
    def web(self):
        driver.back()
        time.sleep(0.5)
        driver.get(self.url)  # load the search-results page
        time.sleep(1.5)
        source = driver.page_source
        html = BeautifulSoup(source, 'html.parser')  # parse page content
        html.list = html.find_all('div', attrs={'class': 'job-list'})
        return html.list

        # 获取招聘要求和公司信息

    def web_a(self, url):
        driver.back()
        # driver.refresh()
        time.sleep(0.5)
        driver.get(url)  # 加载网址
        time.sleep(1.5)
        source = driver.page_source  # 页面内容实例化
        html = BeautifulSoup(source, 'html.parser')  # 获取页面内容
        html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
        # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
        #     f.write(source)
        # print(html)
        return html.list

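# The fixed time.sleep() calls in Web are fragile on slow connections; an
# explicit wait is more robust. A hypothetical helper (not wired in above),
# using Selenium's standard WebDriverWait API:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

def _wait_for_css(selector, timeout=10):
    # block until at least one element matching the CSS selector is present
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, selector)))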

df = pd.DataFrame()  # global accumulator for the scraped rows
key = '物流管理'  # search keyword (alternatives: 物流经理, 物流运营)
salary = '6'  # '5' = 15-20K, '6' = 20-30K
if __name__ == '__main__':
    # jobMesssage()
    # jobRequire()
    # opt = ChromeOptions()  # 创建chrome参数
    # opt.headless = False  # 显示浏览器
    # driver = Chrome(options=opt)  # 浏览器实例化
    # # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)
    # url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
    # url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
    # # 招聘公司信息获取
    # myWeb = Web(url)  # 实例化类
    # time.sleep(0.3)
    # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
    # df5 = jobRequire(html)  # 获取职位需求信息
    # print(df5)
    # time.sleep(0.5)

    opt = ChromeOptions()  # Chrome options
    # don't load images (faster page loads)
    prefs = {"profile.managed_default_content_settings.images": 2}
    opt.add_experimental_option("prefs", prefs)
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)  # start the browser
    # driver = webdriver.Chrome()
    driver.set_window_size(300, 700)
    url = 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
    '''
    for i in range(3):  # +str(i);key=
        try:
            print(str(i), '获取第{}页数据'.format(i + 1))
            url='https://www.zhipin.com/c101210100/y_'+salary+'/?query='+key+'&city=101210100&industry=&position=&ka=sel-salary-'+salary+'&page='+str(i+1)+'&ka=page-'+str(i+1)
            print(url)
            #'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
            #'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
            #'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
            #‘https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2’
            time1 = time.time()  # 计算时长
            # 获取招聘职位信息
            myWeb=Web(url)
            html=myWeb.web()#获取招聘岗位信息
            # html=myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')# 获取招聘要求和公司信息
            time.sleep(0.5)
            # print(html)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
            # print(url_b)
            # # 招聘公司信息获取
            # myWeb = Web(url)  # 实例化类
            # time.sleep(0.3)
            # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
            # df2 = jobRequire(html)  # 获取职位需求信息
            # print(df2)
            # time.sleep(0.5)
            #
            # df3 = pd.concat([df1, df2], axis=1)
            # df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df = pd.concat([df, df3], axis=0)
            # print(df)
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # time.sleep(0.5)
            time2 = time.time()  # 计算时长
            print(str(i), '数据正常'.format(i + 1))
            print('总耗时:{}'.format(time2 - time1))
        except:
            print(str(i), '数据异常'.format(i + 1))
    
    # 写入excel
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # 写入excel
        myWe.run()  # 执行多线程
    '''
    # write the detail info into the existing Excel file
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # Excel writer
        myWe.run_a()  # run the per-posting detail pass

    try:  # shut down the browser
        # close every open window first, then quit the driver; the original
        # quit first and then touched window handles, which always raised
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill any leftover chromedriver process
    except:
        print('browser already closed')

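To sanity-check the read-back pass (wE_r_a) before running it, you can confirm that jobBoss.xlsx really holds the posting URLs in column I. A minimal standalone sketch, assuming the file and its 'worksheet' sheet already exist:

import xlwings as xw

app = xw.App(visible=False, add_book=False)
wb = app.books.open('jobBoss.xlsx')
sh = wb.sheets['worksheet']
urls = [v for v in sh.range('i:i').value if v is not None][1:]  # drop the header cell
print(len(urls), 'posting URLs to revisit')
wb.close()
app.quit()
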
Looking up each job posting individually and writing it to Excel:

import requests
from bs4 import BeautifulSoup
import datetime
import json
import xlwings as xw
from selenium import webdriver
import time
import pandas as pd
from selenium.webdriver import Chrome, ChromeOptions, ActionChains
from selenium.webdriver.common.keys import Keys
import csv
import multiprocessing
import os


def jobMesssage(html):
    df_jobMesssage = pd.DataFrame()
    df = pd.DataFrame()
    # with open('jobhtml.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-list'})
    # print(html.list)
    for i, item in enumerate(html):
        item.list = item.find_all('div', attrs={'class': 'job-primary'})
        # print(item, i, sep=',')
        for i, item in enumerate(item.list):  # one iteration per job card (rebinds i and item)
            # print(item, i, sep=',')
            try:
                item.list = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ', '').replace(
                    '\n', ' ')
                print(item.list, i, sep=',')

                df_jobMesssage['招聘职位网址'] = 'https://www.zhipin.com' + item.find('div',
                                                                                attrs={'class': 'primary-box'}).get(
                    'href'),
                df_jobMesssage['岗位名称'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-name'}).text,
                df_jobMesssage['工作地及要求'] = item.find('div', attrs={'class': 'job-title'}).find('span', attrs={
                    'class': 'job-area-wrapper'}).text.strip('\n'),  #
                df_jobMesssage['公司名称'] = item.find('div', attrs={'class': 'info-company'}).text.replace(' ',
                                                                                                        '').replace(
                    '\n', ' '),
                df_jobMesssage['薪资'] = item.find('div', attrs={'class': 'job-limit clearfix'}).text.strip('').replace(
                    '\n', ' '),
                df_jobMesssage['福利'] = item.find('div', attrs={'class': 'info-append clearfix'}).text.replace(' ',
                                                                                                              '').replace(
                    '\n', ' '),
                # print(df_jobMesssage)
                df_jobMesssage.to_csv('job.csv', mode='a+', header=None, index=True, encoding='utf-8-sig', sep=',')
                df = pd.concat([df, df_jobMesssage], axis=0)
                df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
                print(str(i), 'job card parsed and saved')
            except:
                print(str(i), 'job card parse failed')
    return df

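# Self-contained illustration of the find()/find_all() pattern jobMesssage
# relies on, using a hypothetical job-card snippet (the real class names on
# zhipin.com can change at any time):
_sample_card = BeautifulSoup(
    '<div class="job-primary"><span class="job-name">物流经理</span></div>',
    'html.parser')
assert _sample_card.find('span', attrs={'class': 'job-name'}).text == '物流经理'
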

def jobRequire(html):
    # df = pd.DataFrame()
    df = {}  # 定义字典
    # # url='https://www.zhipin.com/job_detail/c3aea253a5b3b2501nJ92d-9GFBR.html'
    # url='https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html'
    # # url='https://www.zhipin.com/job_detail/1635c904e28317c31nN63ti0FlJY.html'
    # cookie = 'Cookie: __guid=95203226.4063907470298592000.1630401055947.081; _bl_uid=tIkzmsaaz8bup1qepsempvm87k3z; wt2=Dt6B1sNjfS9mOw2rOUcWz7LnE65oG5AcG7C-7iuSGQ10DZgwjtuGdrBZlKOJt5QsEu8DWRIOSeNQ2a7qP7q1yRQ~~; lastCity=101210100; __g=-; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1630888771,1632789052,1632907583,1632959098; acw_tc=0bdd34ba16329610479403976e01a46b6a653805d48cc356c7a1254d2d5375; __c=1632959098; __a=66278464.1630401067.1632907554.1632959098.52.6.7.47; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1632962530; __zp_stoken__=0138dGiMjNjETJHpLDRQ2VDBYbnMRPGxPGRFeJC8TJ0Y%2FASEDIHMxYwBwZi8AHjN%2BTxwJVQgkUkJCHRMVQ3ACZm0YMWV2U1EgOHM5WnAVdzxse017agxTPj5JZUd4Q1w1DSU7fXVbUEcKIRY%3D; __zp_sseed__=iVynj0LLIRVDsqGYxrY8A2rJBiqFMuzEYl1KvBTzD1Q=; __zp_sname__=e948d594; __zp_sts__=1632962688132; monitor_count=40'
    # # cookie ='Cookie: HMACCOUNT_BFESS=399A131593FFAEE5; BDUSS_BFESS=VpjS3U5Q1hQd3ktdkMwand3N3k1ekppN1FJSUhSc2EtdVBEMGhBaU0zSEdYbEpoRVFBQUFBJCQAAAAAAAAAAAEAAADB320FNTMzNTg5NDkzAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAMbRKmHG0SphW; BAIDUID_BFESS=DA74B922ACBBFCBDF71367A36C973898:FG=1'
    # # cookie ='set-cookie: __zp_sseed__=iVynj0LLIRVDsqGYxrY8A7QRlGL1xd7z8VDrvc0yURg=; Path=/; Domain=.zhipin.com'
    # headers = {
    #     'user-agent': 'user-agent: Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    #     'Cookie': cookie,
    #     'Connection': 'keep - alive',
    #     'Accept':'Accept: image / avif, image / webp, image / apng, image / *, * / *;q = 0.8',
    # }
    # # 新闻链接
    # # session = requests.session()
    # res = requests.get(url=url, headers=headers, timeout=30)
    # res.encoding = 'utf-8'
    # res.raise_for_status()
    # res.encoding = res.apparent_encoding
    # html = BeautifulSoup(res.text, 'html.parser')
    # time.sleep(3)
    # print(html)
    # # 存入本地
    # with open('jobhtmlText.html','w',encoding='utf-8-sig') as f:#gbk,utf-8-sig\gb2312
    #     f.write(res.text)
    # with open('jobhtmlText.html', 'r', encoding='utf-8') as f:
    #     html = BeautifulSoup(f, 'html.parser')
    # html.list = html.find_all('div', attrs={'class': 'job-detail'})  # 整体框架
    for i, item in enumerate(html):
        # print(item, 1, sep=',')
        item.list = item.find_all('div', attrs={'class': 'text'})[0].text.strip('').replace(' ', '')
        print(item.list, i, sep=',')
        try:
            df['招聘要求'] = item.find_all('div', attrs={'class': 'text'})[0].text.strip('\n').replace(' ', '').replace(
                '\n', ' ').replace('\r', ' ').replace('\t', ' '),  # requirements text, flattened to one line
            df['公司信息'] = item.find_all('div', attrs={'class': 'job-sec company-info'})[0].text.strip('\n').replace(' ', ''),
            # df.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # print(df)
            print(str(i), 'requirements parsed')
        except:
            print(str(i), 'requirements parse failed')

    return df

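# Note: the trailing commas in jobRequire turn each stored value into a
# 1-tuple, e.g. df['招聘要求'] = ('text',). xlwings writes a length-1 sequence
# into a single cell, so this works, but dropping the commas would store
# plain strings.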

class writeExcel:
    def __init__(self, data):
        self.data = data
        # print(data)

    def wE_r(self):
        app = xw.App(visible=False, add_book=False)
        new_workbook = xw.Book()
        new_worksheet = new_workbook.sheets.add('worksheet')
        app.display_alerts = False
        app.screen_updating = False
        title = ["序号", "岗位名称", "发布日期", "薪资", "工作地及要求", "公司名称", "公司规模", "所属行业", "招聘职位网址", "招聘要求",
                 "招聘公司网址", "公司信息", '福利', '关键字', '薪资范围', '标记', '顺序', '记录日期']
        new_worksheet['A1'].value = title

        for i in range(len(self.data)):
            try:
                # df_w = jobRequire(self.data[i]['招聘职位网址'])
                # print(self.data[i]['招聘职位网址'])
                new_worksheet.cells[i + 1, 0].value = i + 1
                new_worksheet.cells[i + 1, 1].value = self.data[i]['岗位名称']
                new_worksheet.cells[i + 1, 2].value = ''  # self.data[i]['发布日期']
                new_worksheet.cells[i + 1, 3].value = self.data[i]['薪资']
                new_worksheet.cells[i + 1, 4].value = self.data[i]['工作地及要求']
                new_worksheet.cells[i + 1, 5].value = self.data[i]['公司名称']
                new_worksheet.cells[i + 1, 6].value = ''  # self.data[i]['公司规模']
                new_worksheet.cells[i + 1, 7].value = ''  # self.data[i]['所属行业']
                new_worksheet.cells[i + 1, 8].value = self.data[i]['招聘职位网址']
                # new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 10].value = ''  # self.data[i]['招聘公司网址']
                # new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                new_worksheet.cells[i + 1, 12].value = ''  # self.data[i]['福利']
                # fields that change per run
                new_worksheet.cells[i + 1, 13].value = key  # search keyword
                new_worksheet.cells[i + 1, 14].value = '20-30k' if salary == '6' else '15-20K'  # salary band
                new_worksheet.cells[i + 1, 17].value = datetime.date.today()  # record date

                print(str(i), 'Excel row written')
            except:
                print(str(i), 'Excel row write failed')
        # second pass: fetch requirements and company info per posting
        for i in range(len(self.data)):
            try:
                time1 = time.time()  # timing start
                myWeb = Web(url)  # instantiate the crawler
                time.sleep(0.5)
                html = myWeb.web_a(self.data[i]['招聘职位网址'])  # load the detail page
                df_w = jobRequire(html)  # extract the job requirements
                print(df_w)
                time.sleep(3)
                new_worksheet.cells[i + 1, 9].value = df_w['招聘要求']
                new_worksheet.cells[i + 1, 11].value = df_w['公司信息']
                print(str(i), 'Excel detail row written')
                time2 = time.time()  # timing end
                print('elapsed: {}'.format(time2 - time1))
            except:
                print(str(i), 'Excel detail row write failed')

        new_worksheet.autofit()
        new_workbook.save('jobBoss.xlsx')
        new_workbook.close()
        app.quit()

    def run(self):
        # target=self.wE_r() called the method here and handed Process a None
        # target, so the work always ran in this process; call it directly.
        self.wE_r()


class Web:
    def __init__(self, url):
        self.url = url

    # fetch the job-listing page
    def web(self):
        driver.back()
        # driver.refresh()
        time.sleep(0.5)
        driver.get(self.url)  # load the URL
        time.sleep(1.5)
        source = driver.page_source  # raw page source
        html = BeautifulSoup(source, 'html.parser')  # parse the page
        html.list = html.find_all('div', attrs={'class': 'job-list'})
        # with open('jobhtml.html', 'w', encoding='utf-8-sig') as f:
        #     f.write(source)
        # print(html)
        return html.list

    # fetch the job-detail page (requirements and company info)
    def web_a(self, url):
        driver.back()
        # driver.refresh()
        print('navigating back before loading the detail page')
        time.sleep(0.5)
        driver.get(url)  # load the URL
        # driver.refresh()
        # print('refresh')
        time.sleep(2)
        source = driver.page_source  # raw page source
        html = BeautifulSoup(source, 'html.parser')  # parse the page
        html.list = html.find_all('div', attrs={'class': 'job-detail'})  # detail container
        # with open('jobhtmlText.html', 'w', encoding='utf-8-sig') as f:
        #     f.write(source)
        # print(html)
        return html.list


df = pd.DataFrame()  # global accumulator for the scraped rows
key = '物流管理'  # search keyword (alternatives: 物流经理, 物流运营)
salary = '5'  # '5' = 15-20K, '6' = 20-30K
if __name__ == '__main__':
    # jobMesssage()
    # jobRequire()
    # opt = ChromeOptions()  # 创建chrome参数
    # opt.headless = False  # 显示浏览器
    # driver = Chrome(options=opt)  # 浏览器实例化
    # # driver=webdriver.Chrome()
    # driver.set_window_size(300, 700)
    # url='https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
    # url_b='https://www.zhipin.com/job_detail/63a31859fef2dbbc1nJy0tS8EFJY.html'
    # # 招聘公司信息获取
    # myWeb = Web(url)  # 实例化类
    # time.sleep(0.3)
    # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
    # df5 = jobRequire(html)  # 获取职位需求信息
    # print(df5)
    # time.sleep(0.5)

    opt = ChromeOptions()  # Chrome options
    # don't load images (faster page loads)
    prefs = {"profile.managed_default_content_settings.images": 2}
    opt.add_experimental_option("prefs", prefs)
    opt.headless = False  # show the browser window
    driver = Chrome(options=opt)  # start the browser
    # driver = webdriver.Chrome()
    driver.set_window_size(300, 700)

    url = 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
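
    # Note: the page loop below interpolates `key` into the URL without
    # percent-encoding; the Chrome driver tolerates raw Chinese in the query
    # string, but encoding it explicitly is safer, e.g. (hypothetical):
    #   from urllib.parse import quote
    #   encoded_key = quote(key)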

    for i in range(3):  # one iteration per result page
        try:
            print(str(i), 'fetching page {}'.format(i + 1))
            url = 'https://www.zhipin.com/c101210100/y_' + salary + '/?query=' + key + '&city=101210100&industry=&position=&ka=sel-salary-' + salary + '&page=' + str(
                i + 1) + '&ka=page-' + str(i + 1)
            print(url)
            # 'https://www.zhipin.com/job_detail/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&city=101210100&industry=&position='
            # 'https://www.zhipin.com/c101210100/y_6/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-6'
            # 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&ka=sel-salary-5'
            # 'https://www.zhipin.com/c101210100/y_5/?query=%E7%89%A9%E6%B5%81%E8%BF%90%E8%90%A5&page=2&ka=page-2'
            time1 = time.time()  # timing start
            # fetch the listing page
            myWeb = Web(url)
            html = myWeb.web()  # job cards for this page
            # html = myWeb.web_a('https://www.zhipin.com/job_detail/c2b2f449e3c613a71nN72NS1FlpW.html')  # detail page (requirements + company info)
            time.sleep(0.5)
            # print(html)
            df1 = jobMesssage(html)
            df = pd.concat([df1, df], axis=0)
            df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # url_b = str(df1['招聘公司网址'].values).strip("['").strip("']").strip('')
            # print(url_b)
            # # 招聘公司信息获取
            # myWeb = Web(url)  # 实例化类
            # time.sleep(0.3)
            # html = myWeb.web_a(url_b)  # 'https://jobs.51job.com/all/co3836624.html')  # 实例化网址
            # df2 = jobRequire(html)  # 获取职位需求信息
            # print(df2)
            # time.sleep(0.5)
            #
            # df3 = pd.concat([df1, df2], axis=1)
            # df3.to_csv('job.csv', mode='a+', header=None, index=None, encoding='utf-8-sig', sep=',')
            # df = pd.concat([df, df3], axis=0)
            # print(df)
            # df.to_json('jobBoss.json', orient='records', indent=1, force_ascii=False)
            # time.sleep(0.5)
            time2 = time.time()  # timing end
            print(str(i), 'page {} OK'.format(i + 1))
            print('elapsed: {}'.format(time2 - time1))
        except:
            print(str(i), 'page {} failed'.format(i + 1))

    # write the results to Excel
    with open('jobBoss.json', 'r', encoding='utf-8') as f:
        data = json.load(f)
        # print(data)
        myWe = writeExcel(data)  # Excel writer
        myWe.run()  # write the rows to Excel

    try:  # shut down the browser
        # close every open window first, then quit the driver; the original
        # quit first and then touched window handles, which always raised
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            driver.close()
            time.sleep(1.2)
        driver.quit()
        os.system('taskkill /F /IM chromedriver.exe')  # kill any leftover chromedriver process
    except:
        print('browser already closed')
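
Because each run overwrites jobBoss.json for a single keyword/salary combination, merging several runs into one dataset can leave duplicate postings. A minimal post-processing sketch, assuming the Chinese column names used above and that openpyxl is installed for the Excel export:

import pandas as pd

df = pd.read_json('jobBoss.json')
df = df.drop_duplicates(subset='招聘职位网址', keep='first')  # dedupe on the posting URL
df.to_excel('jobBoss_dedup.xlsx', index=False)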
