from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
import time
import json
import csv
import random
from pyquery import PyQuery as pq
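
# Browser setup: the 'images': 2 pref tells Chrome to block image loading,
# which noticeably speeds up page fetches during scraping.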
options = webdriver.ChromeOptions()
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})
browser = webdriver.Chrome(options=options)
browser.maximize_window()
wait = WebDriverWait(browser, 10)
url = 'https://www.51job.com/'
keyword = input('Enter a job title: ').strip()
data_list = []
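
# start_spider: run the keyword search on 51job, expand the city filter to
# all cities, and work out how many result pages the search returned.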
def start_spider():
    browser.get(url)
    # The second '.nlink a' entry opens the job-search page.
    nav_links = browser.find_elements(By.CSS_SELECTOR, '.nlink a')
    nav_links[1].click()
    browser.find_element(By.ID, 'keywordInput').send_keys(keyword)
    browser.find_element(By.ID, 'search_btn').click()
    time.sleep(5)
    # Open the city filter, select all cities, and confirm.
    browser.find_element(By.CLASS_NAME, 'allcity').click()
    browser.find_element(By.CLASS_NAME, 'ttag').click()
    time.sleep(5)
    browser.find_element(By.CLASS_NAME, 'but_box').find_element(By.CLASS_NAME, 'p_but').click()
    time.sleep(5)
    wait.until(
        EC.presence_of_all_elements_located(
            (By.CLASS_NAME, 'next')
        )
    )
    # Scroll to the bottom and back up to trigger lazy-loaded content.
    browser.execute_script('document.documentElement.scrollTop=10000')
    time.sleep(random.randint(1, 3))
    browser.execute_script('document.documentElement.scrollTop=0')
    # The pager text looks like '1/186'; the part after '/' is the total
    # page count. Return it as an int so the caller can compare against it.
    page = browser.find_element(By.CSS_SELECTOR, '.rt.rt_page').text
    print(page)
    return int(page.split('/')[1].strip())
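
# index_page: step through every result page by typing the page number into
# the pager's jump box, then hand each loaded page to get_products().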
def index_page():
    all_pages = start_spider()
    count = 0
    while True:
        # Jump straight to the next page via the page-number input box.
        jump_input = browser.find_element(By.ID, 'jump_page')
        jump_input.clear()
        jump_input.send_keys(str(count + 1))
        browser.find_element(By.CLASS_NAME, 'og_but').click()
        print('Page {} started...'.format(count + 1))
        try:
            count += 1
            wait.until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, '.j_joblist')
                )
            )
            # Scroll down and back up to trigger lazy loading.
            browser.execute_script('document.documentElement.scrollTop=10000')
            time.sleep(random.randint(1, 3))
            browser.execute_script('document.documentElement.scrollTop=0')
            get_products()
        except Exception as e:
            # Log and fall through so the loop can still terminate on the
            # last page instead of skipping the break check below.
            print('Page {} failed: {}'.format(count, e))
        print('Page {} done...'.format(count))
        if count == all_pages:
            break
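
# Parsing is done on browser.page_source with PyQuery rather than through
# Selenium element lookups: one HTML snapshot is grabbed per page, avoiding
# a WebDriver round-trip for every field.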
def get_products():
    """Parse the current result page with PyQuery and append one dict
    per job posting to data_list."""
    html = browser.page_source
    doc = pq(html)
    items = doc('.j_joblist .e').items()
    for item in items:
        # PyQuery's .attr() returns None when an attribute is missing,
        # so no try/except is needed around these lookups.
        data_dict = {
            'post': item.find('.jname').text(),
            'post_link': item.find('.el').attr('href'),
            'date_issue': item.find('.time').text(),
            'salary': item.find('.sal').text(),
            'extra': item.find('.d').text(),
            'tags': item.find('.tags').attr('title'),
            'cname': item.find('.cname').text(),
            'clink': item.find('.cname').attr('href'),
            'ctype': item.find('.dc').text(),
            'cint': item.find('.int').text(),
        }
        print(data_dict)
        data_list.append(data_dict)
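
# main: run the crawl, then dump everything scraped to both JSON and CSV.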
def main():
    index_page()
    print(data_list)
    if not data_list:
        print('No data scraped, nothing to write.')
        return
    # Use 'w' rather than 'a+': appending a second JSON document to the
    # same file would produce invalid JSON.
    with open('data_json.json', 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=4)
    print('JSON file written')
    with open('data_csv.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, data_list[0].keys())
        writer.writeheader()
        writer.writerows(data_list)
    print('CSV file written')


if __name__ == '__main__':
    main()
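
# The fixed time.sleep(5) pauses above are the simplest way to let each page
# settle, but an explicit wait is usually more reliable. A minimal sketch,
# reusing the same 'search_btn' element and the `wait` object configured above:
#
#     wait.until(EC.element_to_be_clickable((By.ID, 'search_btn'))).click()
#
# element_to_be_clickable blocks until the element is both visible and
# enabled, or raises TimeoutException after the 10 s configured on `wait`.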