[Python知识库] 爬虫-python -(12) 验证码 -selenium

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> Python知识库 -> 爬虫-python -(12) 验证码 -selenium -> 正文阅读

[Python知识库]爬虫-python -(12) 验证码 -selenium

文章目录

1.验证码转化

将图片验证码转化为字符串有两种方式：其一是自己通过图像处理识别文字，然后转化出字符串；其二是交给别人去转化，这样就需要把图片上传到别人的网站，识别后再把字符串返回给自己。
现在学习第二种方法，这样就需要一个网站，这里用的是超级鹰。
需要在超级鹰网站注册，并绑定微信免费领取1000积分。（因为让它识别验证码需要消耗积分）
以上过程都走完后，需要下载超级鹰的python模板，也就是调用超级鹰并返回验证码的程序。这个网站上有，直接下载即可。需要简单阅读一下这个程序，搞清楚需要输入哪些内容。
1-3分别对应超级鹰的账号、密码以及ID，ID要在账户内生成。
4-5分别为需要识别的验证码图片数据、验证码类型。
在这里插入图片描述
2.实现自动登录超级鹰网站
登录界面与网站

从上传验证码图片到超级鹰返回结果，中间的代码不必从模块中复制过来，可以直接用 from chaojiying import Chaojiying_Client（即 from 模块 import 类）导入。

'''
1.将网站的验证码下载
2.将验证码通过超级鹰找出然后填入到对应位置登录
'''
import time
import requests
from hashlib import md5
from selenium import webdriver
from chaojiying import Chaojiying_Client #从chaojiying.py导入类

def open_url(url,show):
    """Start a Chrome session and load ``url``.

    Args:
        url: Address passed to ``web.get`` once the browser has started.
        show: When falsy, Chrome is started with ``--headless`` so no
            browser window pops up.

    Returns:
        The ``webdriver.Chrome`` instance after ``get(url)`` has been called.
    """
    option = webdriver.ChromeOptions()
    # Keep some useless log output from being printed.
    option.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
    if not show:
        option.add_argument('--headless')  # headless: work in the background, no window
        # Fixed spelling: the switch used to be written '--disbale--gpu'.
        option.add_argument('--disable-gpu')
    web = webdriver.Chrome(options=option)
    web.get(url)
    return web
def download_code(web):
    """Click the captcha image on the login page and return a screenshot of it.

    Args:
        web: WebDriver-like object currently showing the login page.

    Returns:
        The ``screenshot_as_png`` value of the captcha ``<img>`` element.
    """
    captcha_xpath = '/html/body/div[3]/div/div[3]/div[1]/form/div/img'
    # "xpath" is the string value of selenium's ``By.XPATH``. The generic
    # ``find_element(by, value)`` is used because the ``find_element_by_*``
    # helpers no longer exist in Selenium 4.3+.
    web.find_element("xpath", captcha_xpath).click()  # presumably requests a fresh captcha
    time.sleep(0.1)
    # Look the element up again after the click before taking the screenshot.
    img = web.find_element("xpath", captcha_xpath)
    return img.screenshot_as_png
def web_login(web,user,serect,code):
    """Fill in the login form, submit it and switch to the last window handle.

    Args:
        web: WebDriver showing the login page.
        user: Text typed into the ``name="user"`` field.
        serect: Text typed into the ``name="pass"`` field (the password).
        code: Text typed into the ``name="imgtxt"`` field (the captcha answer).

    Returns:
        The same ``web`` object, after ``switch_to.window`` has been called
        with the last entry of ``web.window_handles``.
    """
    # "css selector" is the string value of selenium's ``By.CSS_SELECTOR``;
    # the ``find_element_by_*`` helpers no longer exist in Selenium 4.3+.
    web.find_element("css selector", '[name = "user"]').send_keys(user)
    web.find_element("css selector", '[name = "pass"]').send_keys(serect)
    web.find_element("css selector", '[name = "imgtxt"]').send_keys(code)
    time.sleep(0.5)
    web.find_element("css selector", '[value = "登录"]').click()
    time.sleep(2)  # fixed wait after submitting the form
    web.switch_to.window(web.window_handles[-1])  # switch to the newest window
    return web
def url_get_data(web):
    """Return the text of the account-points ``<span>`` on the current page.

    Args:
        web: WebDriver-like object positioned on the page after login.

    Returns:
        The ``text`` attribute of the element found at the hard-coded XPath.
    """
    # "xpath" is the string value of selenium's ``By.XPATH``; the
    # ``find_element_by_*`` helpers no longer exist in Selenium 4.3+.
    points_span = web.find_element("xpath", '/html/body/div[3]/div[2]/div[1]/div[1]/span')
    return points_span.text
if __name__ == '__main__':
    # Script entry point: log in to the chaojiying site, solving its captcha
    # through the chaojiying recognition service, then print the account points.
    url= 'http://www.chaojiying.com/user/login/'
    chaojiying = Chaojiying_Client('账号', '密码', 'ID')
    # 1. Open the site.
    web = open_url(url,True)   # pass False to run without a visible window
    try:
        time.sleep(3)
        # 2. Screenshot the captcha on the login page.
        im = download_code(web)  # raw PNG bytes; nothing is saved to disk
        # 3. Send the captcha image to chaojiying and read back the recognised text.
        code_res = chaojiying.PostPic(im, 1902)['pic_str']
        # 4. Enter account, password and captcha, then log in.
        web = web_login(web,'账号', '密码', code_res)
        # 5. Read the account points from the page shown after login.
        res = url_get_data(web)
        print('账户积分：'+res)
        time.sleep(1)
    finally:
        # quit() ends the whole WebDriver session (all windows plus the driver
        # process), and the finally block makes sure this also happens when a
        # step above raises; close() would only close the current window.
        web.quit()

2. 52破解的注册 - 谷歌验证码

注册页面
由于12306现在没有图片点击验证码了，所以自己找了一个类似的，但是比那个要难，因为会出现2次或多次，现在只考虑产生一次的情况
要切到两个iframe里面，所以要先切进去，再切出来，再切到另外一个里面。

from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from chaojiying import Chaojiying_Client #从chaojiying.py导入类
from selenium.webdriver.common.action_chains import ActionChains

def open_url(url):
    """Launch Chrome with automation hints reduced and navigate to ``url``.

    Args:
        url: Address to load in the new browser session.

    Returns:
        The ``webdriver.Chrome`` instance after ``get(url)`` has been called.
    """
    chrome_options = Options()
    # Keep some useless log output from being printed.
    excluded_switches = ['enable-automation', 'enable-logging']
    chrome_options.add_experimental_option("excludeSwitches", excluded_switches)
    # Intended to keep the site from recognising the browser as chromedriver-driven.
    chrome_options.add_argument('--disable-blink-features=AutomationControlled')
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url)
    return driver


if __name__ == '__main__':
    # Script entry point: open the 52pojie login page, tick the reCAPTCHA
    # checkbox, send the image challenge to chaojiying and click the returned
    # coordinates. Only a single challenge round is handled.
    #
    # "xpath" / "css selector" are the string values of selenium's By.XPATH /
    # By.CSS_SELECTOR; find_element(by, value) is used because the
    # find_element_by_* helpers no longer exist in Selenium 4.3+.
    web = open_url('https://www.52pojie.cn/member.php?mod=logging&action=login')
    try:
        time.sleep(3)
        chaojiying = Chaojiying_Client('超级鹰账号', '超级鹰密码', 'ID')
        iframe = web.find_element("xpath", '//*[@id="seccode_cS"]/div/table/tbody/tr/td/div/div/div/iframe')
        web.switch_to.frame(iframe)  # enter the first iframe (the checkbox)
        web.find_element("xpath", '//*[@id="rc-anchor-container"]/div[4]').click()
        time.sleep(3)
        web.switch_to.default_content()  # back to the main page
        res = web.find_element("css selector", '[style ="z-index: 2000000000; position: relative; width: 400px; height: 580px;"]')
        iframes = res.find_elements("css selector", '[title= "reCAPTCHA 验证将于 2 分钟后过期"]')
        iframe = iframes[-1]
        web.switch_to.frame(iframe)  # enter the second iframe (the image challenge)
        code_pic = web.find_element("xpath", '//*[@id="rc-imageselect"]/div[2]')
        code_res = chaojiying.PostPic(code_pic.screenshot_as_png, 9004)['pic_str']
        print(code_res)
        time.sleep(2)
        # The loop expects code_res in the form "x1,y1|x2,y2|...".
        # Empty segments (e.g. an empty answer) are skipped instead of
        # crashing on the tuple unpacking.
        for point in code_res.split('|'):
            if not point:
                continue
            x, y = point.split(',')
            print([int(x),int(y)])
            # NOTE(review): newer Selenium releases appear to measure this offset
            # from the element's centre rather than its top-left corner — confirm
            # against the installed version and adjust the coordinates if needed.
            ActionChains(web).move_to_element_with_offset(code_pic,int(x),int(y)).click().perform()
            time.sleep(1)
        web.find_element("xpath", '//*[@id="recaptcha-verify-button"]').click()
        time.sleep(1000)  # long pause so the result can be inspected in the browser
    finally:
        # End the WebDriver session even when one of the steps above raises.
        web.quit()

3.12306登录

12306现在没有了图片点击验证倒是让我感觉很意外。

from tkinter import Button
from selenium import webdriver
import time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
def open_url(url):
    """Create a Chrome driver configured for the 12306 login and open ``url``.

    Args:
        url: Address to load in the new browser session.

    Returns:
        The ``webdriver.Chrome`` instance after ``get(url)`` has been called.
    """
    opts = Options()
    # Keep some useless log output from being printed.
    opts.add_experimental_option(
        "excludeSwitches",
        ['enable-automation', 'enable-logging'],
    )
    # Intended to keep 12306 from noticing that chromedriver is in control.
    opts.add_argument('--disable-blink-features=AutomationControlled')
    browser = webdriver.Chrome(options=opts)
    browser.get(url)
    return browser

def web_login(web,user,serect,code=''):
    """Log in on the 12306 page: fill the form, drag the slider, confirm the dialog.

    Args:
        web: WebDriver showing the 12306 login page.
        user: Text typed into the ``J-userName`` field.
        serect: Text typed into the ``J-password`` field (the password).
        code: Unused; kept so existing callers that pass it keep working.

    Returns:
        The same ``web`` object, after ``switch_to.window`` has been called
        with the last entry of ``web.window_handles``.
    """
    # "css selector" / "xpath" are the string values of selenium's
    # By.CSS_SELECTOR / By.XPATH; the find_element_by_* helpers no longer
    # exist in Selenium 4.3+.
    web.find_element("css selector", '[id = "J-userName"]').send_keys(user)
    web.find_element("css selector", '[id = "J-password"]').send_keys(serect)
    time.sleep(0.5)
    web.find_element("css selector", '[id = "J-login"]').click()
    time.sleep(2)
    # Slider verification: drag the handle 350 px to the right.
    button = web.find_element("xpath", '//*[@id="nc_1__scale_text"]/span')
    webdriver.ActionChains(web).drag_and_drop_by_offset(button,350,0).perform()
    web.switch_to.window(web.window_handles[-1])  # switch to the newest window
    time.sleep(3)
    # Confirm the epidemic-prevention notice dialog.
    web.find_element("css selector", '[class = "btn btn-primary ok"]').click()

    return web

def get_web_data(web):
    """Return the text of the first two blocks inside ``#js-minHeight``.

    Args:
        web: WebDriver-like object positioned on the page shown after login.

    Returns:
        A ``(res, res2)`` tuple holding the ``text`` of
        ``//*[@id="js-minHeight"]/div[1]/div[1]`` and ``.../div[1]/div[2]``.
    """
    # "xpath" is the string value of selenium's ``By.XPATH``; the
    # ``find_element_by_*`` helpers no longer exist in Selenium 4.3+.
    res = web.find_element("xpath", '//*[@id="js-minHeight"]/div[1]/div[1]').text
    res2 = web.find_element("xpath", '//*[@id="js-minHeight"]/div[1]/div[2]').text
    return res,res2
    

if __name__=='__main__':
    # Script entry point: log in to 12306 and print two text blocks from the
    # page shown afterwards.
    url = 'https://kyfw.12306.cn/otn/resources/login.html'
    web = open_url(url)
    try:
        time.sleep(2)
        web = web_login(web,'12306账号','12306密码')
        time.sleep(2)
        res,res2 = get_web_data(web)
        print(res)
        print(res2)
        time.sleep(100)  # long pause so the page can be inspected in the browser
    finally:
        # quit() ends the whole WebDriver session, and the finally block makes
        # sure this also happens when a step above raises; close() would only
        # close the current window.
        web.quit()