from os.path import join
from time import strftime
import xlrd
import xlwt
from xlutils.copy import copy
from selenium.webdriver import Firefox
import time
from selenium.webdriver.common.action_chains import ActionChains
import driver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
excel_path = "D:\\tools"  # Directory the Excel workbook is written into (hard-coded Windows path).
now_time = strftime("%Y-%m-%d-%H")  # Hour-resolution timestamp, evaluated once at import; used as both the workbook file name and the sheet name.
# Creates the Excel workbook on demand and appends data rows to its first sheet.
class Write_Excel():
    """Append rows of values to an ``.xls`` workbook named after ``now_time``.

    The workbook path is built from the module-level ``excel_path`` and
    ``now_time``. Row 0 of the first sheet is reserved for a styled header;
    data rows are appended below it, one row per ``add_to_excel`` call.
    """

    def __init__(self):
        self.path = excel_path
        self.name = now_time + ".xls"
        self.filename = join(self.path, self.name)

    def set_style(self, name, height, bold=False):
        """Return an ``xlwt.XFStyle`` carrying a font with the given settings.

        Args:
            name: Font family name, e.g. ``'Times New Roman'``.
            height: Value assigned to ``xlwt.Font.height`` (presumably in
                xlwt's 1/20-point units -- TODO confirm).
            bold: Whether the font is bold.
        """
        font = xlwt.Font()
        font.name = name
        font.bold = bold
        font.color_index = 4  # fixed palette colour index
        font.height = height

        style = xlwt.XFStyle()
        style.font = font
        return style

    def new_excel(self):
        """Create an empty workbook containing a single sheet named ``now_time``."""
        import os

        # Without this, saving into a missing directory raises
        # FileNotFoundError from inside add_to_excel's own FileNotFoundError
        # handler. An empty path means "current directory": nothing to create.
        if self.path:
            os.makedirs(self.path, exist_ok=True)

        book = xlwt.Workbook()
        book.add_sheet(now_time, cell_overwrite_ok=True)
        book.save(self.filename)
        print("文件【%s】创建成功" % self.name)

    def add_to_excel(self, values):
        """Append ``values`` as one new row and (re)write the header row.

        The workbook is created first if it does not exist yet. The whole
        workbook is read, copied into a writable xlwt workbook, extended
        and saved back on every call.

        Args:
            values: Iterable of cell values; item ``k`` goes to column ``k``.
        """
        try:
            workbook = xlrd.open_workbook(self.filename)
        except FileNotFoundError:
            # First write of this run/hour: create the file, then open it.
            self.new_excel()
            workbook = xlrd.open_workbook(self.filename)

        # Next free row of the first sheet; row 0 stays reserved for the
        # header even when the sheet is still empty.
        target_row = max(workbook.sheet_by_index(0).nrows, 1)

        # xlrd workbooks are read-only; xlutils.copy yields a writable xlwt
        # workbook that already contains the existing rows.
        new_workbook = copy(workbook)
        new_worksheet = new_workbook.get_sheet(0)

        for column, value in enumerate(values):
            new_worksheet.write(target_row, column, value)
        print(" 文件【{0}】表【{1}】第【{2}】行追加数据{3}".format(self.name, now_time, target_row, values))

        # Header cells for row 0. The header is rewritten on every call:
        # the workbook is opened without formatting info, so the copy
        # presumably does not carry the header's style over -- rewriting
        # keeps it styled (NOTE(review): confirm against xlutils behaviour).
        title_row = ["名称", "位置", "申请者", "时间", " ", " ", " "]
        header_style = self.set_style('Times New Roman', 220, True)
        for column, title in enumerate(title_row):
            new_worksheet.write(0, column, title, header_style)

        new_workbook.save(self.filename)
if __name__ == "__main__":
    # Scrape the job list shown after logging in to LinkedIn and append one
    # Excel row per list entry via Write_Excel.

    # Local import: find_element(By.XPATH, ...) works on old and new Selenium
    # releases, whereas the find_element_by_* helpers were removed in newer ones.
    from selenium.webdriver.common.by import By

    scroll_height_js = "return action=document.body.scrollHeight"
    scroll_to_bottom_js = 'window.scrollTo(0, document.body.scrollHeight)'

    web = Firefox()
    try:
        # Open the join page and follow the link to the login form.
        web.get("https://www.linkedin.com/start/join?trk=brandpage_baidu_pc-mainlink")
        web.find_element(By.XPATH, '/html/body/div[1]/main/p[1]/a').click()
        time.sleep(2)

        # Log in.
        # NOTE(review): the credentials below look like placeholders.
        web.find_element(By.XPATH, '//*[@id="username"]').send_keys("xxxxxxxx")
        web.find_element(By.XPATH, '//*[@id="password"]').send_keys("xxxxxxxxx")
        web.find_element(By.XPATH, '/html/body/div/main/div[2]/div[1]/form/div[3]/button').click()
        time.sleep(2)

        # Navigate to the page that holds the data to scrape.
        web.find_element(By.XPATH, '//*[@id="ember24"]').click()
        time.sleep(2)

        # Record the initial page height, then scroll to the bottom so the
        # page starts loading more entries.
        page_height = web.execute_script(scroll_height_js)
        web.execute_script(scroll_to_bottom_js)
        time.sleep(5)

        # Keep scrolling while the page keeps growing. Once the height has
        # not changed for 30 seconds, allow three further 3-second waits
        # (re-checking the height each time) before treating the page as
        # fully loaded.
        last_growth = time.monotonic()
        retries = 0
        while True:
            new_height = web.execute_script(scroll_height_js)
            if new_height > page_height:
                time.sleep(1)
                web.execute_script(scroll_to_bottom_js)
                page_height = new_height
                # Growth seen: restart both the stall timer and the retries.
                last_growth = time.monotonic()
                retries = 0
            elif time.monotonic() - last_growth < 30:
                # Short pause so the loop does not flood the driver with
                # script calls while waiting for content.
                time.sleep(0.5)
            elif retries < 3:
                time.sleep(3)
                retries += 1
            else:
                print("滚动条已经处于页面最下方!")
                # Return to the top of the page before scraping.
                web.execute_script('window.scrollTo(0, 0)')
                break

        # Collect every <li> of the result list and extract its fields.
        li_list = web.find_elements(By.XPATH, '/html/body/div[6]/div[3]/div/section/section[3]/div/div/div[2]/ul/li')
        time.sleep(2)

        writer = Write_Excel()
        for li in li_list:
            job_name = li.find_element(By.XPATH, './section/div/a/div[1]/div[2]/div[1]/div').text
            job_site = li.find_element(By.XPATH, './section/div/a/div[1]/div[2]/div[3]/ul/li').text

            # Applicant count and posting time are optional per entry; a
            # blank cell is written when the element is missing.
            try:
                job_applicant = li.find_element(By.XPATH, './section/div/a/ul/li[2]').text
            except NoSuchElementException:
                job_applicant = ' '
            try:
                job_time = li.find_element(By.XPATH, './section/div/a/ul/li[1]/time').text
            except NoSuchElementException:
                job_time = ' '

            print(job_name, job_site, job_applicant, job_time)
            row_values = [job_name, job_site, job_applicant, job_time]
            writer.add_to_excel(values=row_values)

        time.sleep(3)
    finally:
        # Always close the browser, even when a step above raised.
        web.quit()