selenium的简单操作

爬取拉勾招聘的简单信息1

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
import time

from selenium.webdriver.common.by import By

# Scrape company names and summary info from the lagou.com search results
# for "python".
web = Chrome()

web.get("https://lagou.com")
# Set the location to Beijing via the first entry of the city chooser.
web.find_element(By.XPATH, '//*[@id="changeCityBox"]/ul/li[1]/a').click()
time.sleep(1)
# Type "python" into the search box and submit it with ENTER.
web.find_element(By.XPATH, '//*[@id="search_input"]').send_keys('python', Keys.ENTER)
# Wait for the result page to render; without this pause the job list may be
# queried before it exists and the loop below sees nothing.
time.sleep(1)

lis = web.find_elements(By.XPATH, '//*[@id="jobList"]/div[1]/div')
for li in lis:
    # Extract the fields of interest from each entry of the job list.
    company = li.find_element(By.XPATH, './div/div[2]/div/a').text
    info = li.find_element(By.XPATH, './div[1]/div[1]/div[2]').text
    print(company, info)

爬取拉勾招聘的简单信息2

from  selenium.webdriver import Chrome
import time
from selenium.webdriver.common.keys import Keys

from selenium.webdriver.common.by import By

# Open the first "python" search result on lagou.com, print its job
# description, then return to the original window.
web = Chrome()
# Open the site.
web.get("http://www.lagou.com")
time.sleep(1)
# Dismiss the city-selection popup.
web.find_element(By.XPATH, '//*[@id="cboxClose"]').click()
time.sleep(1)
# Type "python" into the search box and submit it with ENTER.
web.find_element(By.XPATH, '//*[@id="search_input"]').send_keys('python', Keys.ENTER)
time.sleep(1)
# Click the link of the first entry in the job list.
web.find_element(By.XPATH, '//*[@id="jobList"]/div[1]/div[1]/div[1]/div[1]/div[1]/a').click()
time.sleep(1)
# Switch the driver's focus to the last window handle (the page the click opened).
web.switch_to.window(web.window_handles[-1])
time.sleep(1)
# Read the text we are after.
info = web.find_element(By.XPATH, '//*[@id="job_detail"]/dd[2]').text
time.sleep(1)
print(info)
# Close the current (detail) window ...
web.close()
# ... and hand focus back to the very first window.
web.switch_to.window(web.window_handles[0])

无头浏览器及下拉列表

获取艺恩历年排行榜

from selenium.webdriver import Chrome
from selenium.webdriver.support.select import Select
import time
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.common.by import By

# Run Chrome without opening a visible browser window.
opt = Options()
opt.add_argument('--headless')
opt.add_argument('--disable-gpu')

# Start the browser with those options and load the yearly box-office page.
web = Chrome(options=opt)
web.get('https://www.endata.com.cn/BoxOffice/BO/Year/index.html')

# Locate the drop-down (<select>) element.
ss = web.find_element(By.XPATH, '//*[@id="OptionDate"]')

# Wrap it in Select to get the select_by_* helpers.
sels = Select(ss)

# --- Printing the table: uncomment ONE of the variants below ---------------

# Variant 1: iterate over every option by index.
# for option in range(len(sels.options)):
#     sels.select_by_index(option)  # switch to the option at this index
#     time.sleep(1)
#     table = web.find_element(By.XPATH, '//*[@id="TableList"]')  # table for the selected option
#     print(table.text)
#     print("=============================")


# Variant 2: select by the visible text of the option.
# sels.select_by_visible_text('2021年')
# time.sleep(1)
# table = web.find_element(By.XPATH, '//*[@id="TableList"]')  # table for the selected option
# print(table.text)


# Variant 3: select by the option's value attribute.
# sels.select_by_value('2021')
# time.sleep(1)
# table = web.find_element(By.XPATH, '//*[@id="TableList"]')  # table for the selected option
# print(table.text)


# Get the page source after data loading and JavaScript have run.
# time.sleep(2)
# print(web.page_source)

# A headless browser has no window to close by hand, so shut it down
# explicitly; otherwise the Chrome process keeps running in the background.
web.quit()

通过工具超级鹰来进行验证码的处理

chaojiying.py

此文件为下面各个文件导入的chaojiying文件的源码，可以通过超级鹰获得
不同类型的验证码用不同的数字代表验证码类型

import requests
from hashlib import md5

class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition service.

    The client keeps the account credentials and request headers and exposes
    one method per remote endpoint. Every method returns the decoded JSON
    body of the HTTP response.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials and build the parameters shared by all calls.

        username: account user name.
        password: account password as text; only its MD5 hex digest is kept
            and sent (as the ``pass2`` field).
        soft_id: software ID sent as the ``softid`` field.
        timeout: seconds passed to ``requests.post`` for each request, so a
            stalled connection raises instead of blocking forever.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image for recognition and return the JSON reply.

        im: image bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image goes up as a multipart file field named "userfile".
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognized captcha and return the JSON reply.

        im_id: image ID of the captcha being reported.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()


if __name__ == '__main__':
    # Demo: send a local image to the service and print the JSON reply.
    # Generate a software ID under "User Center >> Software ID" and use it
    # in place of 96001.
    chaojiying = Chaojiying_Client('用户名', '密码', '软件ID')
    # Replace a.jpg with the path of a local image file (on Windows the path
    # sometimes needs // separators). The with-block closes the file handle.
    with open('a.jpg', 'rb') as f:
        im = f.read()
    # 1902 is the captcha type code.
    print(chaojiying.PostPic(im, 1902))

超级鹰干掉超级鹰

from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
from chaojiying import Chaojiying_Client

from selenium.webdriver.common.by import By

# Log in to chaojiying.com, using the Chaojiying service itself to read the
# login captcha.
web = Chrome()
web.get('https://www.chaojiying.com/user/login/')

# Locate the captcha image by XPath and capture it as PNG bytes via an
# element screenshot.
img = web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/div/img').screenshot_as_png

# Chaojiying user name, password and software ID.
chaojiying = Chaojiying_Client('用户名', '密码', '软件ID')

# Have the image recognized; 1902 is the captcha type code.
dic = chaojiying.PostPic(img, 1902)
verify_code = dic['pic_str']

# Fill in the form fields and submit by pressing ENTER on the last input.
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[1]/input').send_keys('用户名')
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[2]/input').send_keys('密码')
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[3]/input').send_keys(verify_code)
web.find_element(By.XPATH, '/html/body/div[3]/div/div[3]/div[1]/form/p[4]/input').send_keys(Keys.ENTER)

网站检测到使用自动化工具打开

使用自动化工具打开
在这里插入图片描述
未使用自动化工具打开

解决方法

对于chrome来说

1.chrome的版本号如果小于88  在你启动浏览器的时候(此时没有加载任何网页内容), 向页面嵌入js代码. 去掉webdriver

web = Chrome()
# Register the script through the DevTools protocol before any page is
# loaded, so it is evaluated in each new document and navigator.webdriver
# reads as undefined.
web.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
  "source": """
   navigator.webdriver = undefined
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    })
  """
})
# Placeholder URL: replace with the real target address. It must be a string;
# the bare name used previously raised NameError.
web.get("xxxxxxx")


2.chrome的版本大于等于88
# Build the Chrome options that turn off the Blink "AutomationControlled"
# feature, then start the browser with them.
chrome_opts = Options()
# Optional extra switch, not required:
# chrome_opts.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_opts.add_argument('--disable-blink-features=AutomationControlled')

web = Chrome(options=chrome_opts)
# Placeholder URL: replace with the real target address.
web.get("xxxxxxxxxxxxxxxxxxxxx")

登录中国铁路12306

from selenium.webdriver import Chrome
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.action_chains import ActionChains
from chaojiying import Chaojiying_Client
import time

from selenium.webdriver.common.by import By

# Log in to 12306: fill in the credentials, click login, then drag the
# slider captcha.
option = Options()
# option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_argument('--disable-blink-features=AutomationControlled')

# The options must be handed to the driver; a bare Chrome() silently ignores
# the flag configured above.
web = Chrome(options=option)

web.get('https://kyfw.12306.cn/otn/resources/login.html')
web.find_element(By.XPATH, '//*[@id="J-userName"]').send_keys('用户名')
time.sleep(1)
web.find_element(By.XPATH, '//*[@id="J-password"]').send_keys('密码')
time.sleep(1)

# Optional: solve the image captcha with Chaojiying (disabled here).
# chaojiying = Chaojiying_Client('用户名', '密码', '软件ID')
# # XPath of the image captcha.
# verify_img_element = web.find_element(By.XPATH, '//*[@id="J-loginImg"]')
# # Send a screenshot of it for recognition; 9004 is the captcha type code.
# dic = chaojiying.PostPic(verify_img_element.screenshot_as_png, 9004)
# # The answer has the form "x1,y1|x2,y2|x3,y3".
# result = dic['pic_str']
# rs_list = result.split("|")  # -> ['x1,y1', 'x2,y2', 'x3,y3']
# for rs in rs_list:  # one "x,y" pair
#     p_temp = rs.split(",")
#     x = int(p_temp[0])
#     y = int(p_temp[1])
#     # Move the mouse to the given offset inside the image and click. The
#     # coordinates are relative to the image's top-left corner.
#     # NOTE(review): newer Selenium releases measure this offset from the
#     # element's center instead - confirm against the installed version.
#     ActionChains(web).move_to_element_with_offset(verify_img_element, x, y).click().perform()

# Click the login button.
web.find_element(By.XPATH, '//*[@id="J-login"]').click()
time.sleep(1)

# Slider captcha: locate the slider handle by XPath.
btn = web.find_element(By.XPATH, '//*[@id="nc_1_n1z"]')
# Arguments are (element, x offset, y offset): x is horizontal, and a positive
# value drags to the right; y is vertical.
ActionChains(web).drag_and_drop_by_offset(btn, 300, 0).perform()