import time
from selenium import webdriver
import csv
from selenium.webdriver import ActionChains
# Output CSV, opened in append mode as soon as the module is imported.
# login() adds rows through csv_writer; the __main__ guard closes fp.
fp = open('./51job_data_02.csv', mode='a', newline='', encoding='utf-8')
csv_writer = csv.writer(fp)
# Not referenced anywhere in the visible code.
a = []
# Search-result page for the keyword "大数据" (double URL-encoded); {0} is the
# page number.  The original literal read "cotype=99°reefrom=99": the
# "&deg" of "&degreefrom" had been decoded as an HTML entity, which dropped
# the degree filter parameter and corrupted the query string.
_SEARCH_URL_TEMPLATE = (
    "https://search.51job.com/list/000000,000000,0000,00,0,99,"
    "%25E5%25A4%25A7%25E6%2595%25B0%25E6%258D%25AE,2,{0}.html"
    "?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99"
    "&jobterm=99&companysize=99&ord_field=0&dibiaoid=0&line=&welfare="
)
_LISTING_LINK_XPATH = '/html/body/div[2]/div[3]/div/div[2]/div[4]/div[1]/div[{}]/a'
_TITLE_XPATH = '/html/body/div[2]/div[2]/div[2]/div/div[1]/h1'
_META_XPATH = '/html/body/div[2]/div[2]/div[2]/div/div[1]/p'
_SALARY_XPATH = '/html/body/div[2]/div[2]/div[2]/div/div[1]/strong'
_MESSAGE_XPATH = '/html/body/div[2]/div[2]/div[3]/div[1]/div'


def _load_page(driver, url, settle_seconds, retry_wait_seconds, notice=None):
    """Navigate to *url*, retrying once after a pause; return True if it loaded."""
    try:
        driver.get(url)
    except Exception:
        # `Exception` rather than a bare except so Ctrl-C still stops the run.
        if notice is not None:
            print(notice)
        time.sleep(retry_wait_seconds)
        try:
            driver.get(url)
        except Exception:
            return False
        return True
    # Give the page time to render before elements are queried.
    time.sleep(settle_seconds)
    return True


def _collect_posting_links(driver):
    """Return the hrefs of the (up to 50) postings on the loaded results page."""
    links = []
    for position in range(1, 51):
        # Up to five attempts per slot, pausing 3 s after each failure.
        for _attempt in range(5):
            try:
                anchor = driver.find_element(
                    'xpath', _LISTING_LINK_XPATH.format(position))
                href = anchor.get_attribute("href")
            except Exception:
                time.sleep(3)
                continue
            # An anchor without an href cannot be visited; skip it instead of
            # queueing a navigation that is guaranteed to fail.
            if href:
                links.append(href)
            break
    return links


def _read_title(driver):
    """Return the posting title, or None after two failed attempts."""
    for attempt in (1, 2):
        try:
            return driver.find_element('xpath', _TITLE_XPATH).text
        except Exception:
            if attempt == 2:
                print('--------*****************----------')
                time.sleep(2)
                print('-----------*************************************************-------')
                return None
            time.sleep(5)
    return None


def _read_details(driver):
    """Return (area, experience, education, date, salary, message) or None.

    Tries up to three times, pausing 3 s after each failure.  A success on
    any attempt -- including the last -- is returned.
    """
    for _attempt in range(3):
        try:
            # The header's title attribute holds "area | experience |
            # education | date ..." -- read it once and split.
            header = driver.find_element('xpath', _META_XPATH).get_attribute('title')
            fields = [part.strip() for part in header.split('|')]
            area, experience, education = fields[0], fields[1], fields[2]
            # Drop the '发布' marker so only the date text remains.
            date = fields[3].replace('发布', '')
            salary = driver.find_element('xpath', _SALARY_XPATH).text
            message = '|' + driver.find_element('xpath', _MESSAGE_XPATH).text + '|'
            return area, experience, education, date, salary, message
        except Exception:
            time.sleep(3)
    return None


def login(driver, start_page=60, stop_page=160):
    """Scrape 51job search results and append one CSV row per job posting.

    Despite the name, no authentication happens here: cookies are cleared,
    each results page in ``range(start_page, stop_page)`` is opened, the
    posting links on it are collected, and every posting is visited to
    extract title, area, experience, education, link, date, salary and
    description.  Rows are written through the module-level ``csv_writer``.

    Args:
        driver: a Selenium WebDriver instance.
        start_page: first results-page number to fetch (inclusive).
        stop_page: results-page number to stop at (exclusive).
    """
    # The last scraped record is still published as module globals, as the
    # original implementation did, in case other code reads them.
    global title, area, experience, education, link, date, salary, message
    written = 1
    driver.delete_all_cookies()
    time.sleep(10)
    page_urls = [_SEARCH_URL_TEMPLATE.format(page)
                 for page in range(start_page, stop_page)]
    for page_index, page_url in enumerate(page_urls):
        if not _load_page(driver, page_url, 10, 30,
                          notice='****************************'):
            continue
        for posting_url in _collect_posting_links(driver):
            if not _load_page(driver, posting_url, 3, 40):
                continue
            posting_title = _read_title(driver)
            if posting_title is None:
                continue
            details = _read_details(driver)
            if details is None:
                continue
            title = posting_title
            link = posting_url
            area, experience, education, date, salary, message = details
            print(date)
            try:
                csv_writer.writerow(
                    [title, area, experience, education, link, date, salary, message])
                print(written, page_index)
                written += 1
            except Exception as exc:
                # Still best-effort, but report the dropped row instead of
                # losing it silently.
                print('failed to write row for {}: {!r}'.format(posting_url, exc))
            # Throttle between postings.
            time.sleep(4)
        # Throttle between results pages.
        time.sleep(5)
def main():
    """Start a headless Chrome session, run the scraper, then quit the browser.

    The driver is shut down in a ``finally`` block so that the Chrome and
    chromedriver processes do not outlive the script when scraping fails or
    is interrupted.
    """
    # ChromeOptions collects the start-up configuration for the browser.
    option = webdriver.ChromeOptions()
    option.add_argument('--headless')
    option.add_argument('--disable-gpu')
    option.add_argument('--disable-dev-shm-usage')
    # Experimental option: exclude the "enable-automation" switch.
    option.add_experimental_option('excludeSwitches', ['enable-automation'])
    # Additional start-up arguments.
    option.add_argument("--disable-blink-features=AutomationControlled")
    option.add_argument("--no-sandbox")
    # NOTE(review): "--disable-dev-usage" does not look like a documented
    # Chromium switch (probably a typo of --disable-dev-shm-usage above);
    # kept as-is -- confirm and remove.
    option.add_argument("--disable-dev-usage")
    # Content setting 2 for images: do not load them.
    option.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
    # Create the Chrome driver with the options above.
    driver = webdriver.Chrome(executable_path=r'chromedriver.exe', options=option)
    try:
        driver.set_page_load_timeout(40)
        login(driver)
    finally:
        driver.quit()
if __name__ == '__main__':
    # These three values are not read anywhere in the visible code; they are
    # kept unchanged in case something else depends on them.
    username = 'x'
    password = 'x'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36'}
    try:
        main()
    finally:
        # Close (and thereby flush) the CSV even when the run fails or is
        # interrupted, so rows already written are not lost.
        fp.close()