“超级鹰”验证码识别平台的接口配置,可用于自动识别验证码(须先到其网站注册账号)。将下面的代码保存为 CodeClass.py 文件,供后面的爬取脚本导入调用。
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition web API.

    The client stores the account credentials once and attaches them to
    every request it sends.
    """

    def __init__(self, username, password, soft_id, timeout=30):
        """Store credentials and the request defaults shared by all calls.

        Args:
            username: Chaojiying account name (sent as ``user``).
            password: Plain-text account password; only its MD5 hex digest
                is kept and sent (as ``pass2``).
            soft_id: Software ID sent as ``softid`` with every request.
            timeout: Seconds passed to ``requests.post`` as its timeout.
                Without one, a stalled server would block the caller
                forever. Pass ``None`` to wait indefinitely.
        """
        self.username = username
        # Only the MD5 hex digest of the UTF-8 encoded password is stored.
        self.password = md5(password.encode("utf-8")).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        # Form fields merged into the payload of every request.
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload a captcha image and return the decoded JSON response.

        Args:
            im: Raw image bytes.
            codetype: Captcha type code; see
                http://www.chaojiying.com/price.html for the list.

        Returns:
            The parsed JSON body of the service response.
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        # The image goes up as a multipart file field named 'userfile'.
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised captcha and return the JSON response.

        Args:
            im_id: Picture ID of the captcha whose result was wrong.

        Returns:
            The parsed JSON body of the service response.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
爬取过程(以下代码另存为一个单独的 .py 文件,与 CodeClass.py 放在同一目录下):
from CodeClass import Chaojiying_Client
import requests
from lxml import etree
def getCodeText(imgPath, codeType):
    """Recognise the captcha image at *imgPath* and return its text.

    The image is uploaded exactly once; the recognised text is printed and
    then returned.

    Args:
        imgPath: Path of the captcha image file on disk.
        codeType: Captcha type code passed to ``PostPic``.

    Returns:
        The ``'pic_str'`` field of the service response.

    Raises:
        KeyError: If the response has no ``'pic_str'`` field.
    """
    # NOTE(review): the third constructor argument of Chaojiying_Client is
    # its soft_id, yet codeType is passed here. Kept as in the original;
    # confirm whether a real software ID should be supplied instead.
    chaojiying = Chaojiying_Client('自己的账号', '账号对应的密码', codeType)
    # Close the file handle deterministically instead of leaking it.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    # Call the service once: a second call would upload the image again and
    # could return text different from what was printed.
    pic_str = chaojiying.PostPic(im, codeType)['pic_str']
    print(pic_str)
    return pic_str
# Log in to renren.com with a captcha solved by getCodeText, then save a
# profile page that is fetched with the logged-in session.

# Seconds to wait on each HTTP request so a stalled server cannot hang the script.
REQUEST_TIMEOUT = 30

# One session carries every request, so cookies set while loading the login
# page and the captcha image are sent back with the login POST.
session = requests.Session()
headers = {
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:84.0) Gecko/20100101 Firefox/84.0"
}

# Step 1: load the login page and locate the captcha image.
url = 'http://www.renren.com/SysHome.do'
page_text = session.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT).text
tree = etree.HTML(page_text)
code_img_srcs = tree.xpath('//*[@id="verifyPic_login"]/@src')
if not code_img_srcs:
    # Fail with a clear message rather than a bare IndexError.
    raise RuntimeError('captcha image (id="verifyPic_login") not found on the login page')
code_img_src = code_img_srcs[0]

# Step 2: download the captcha through the same session and save it.
code_img_data = session.get(url=code_img_src, headers=headers, timeout=REQUEST_TIMEOUT).content
with open('./code.jpg', 'wb') as fp:
    fp.write(code_img_data)

# Step 3: have the recognition service read the captcha text.
result = getCodeText('code.jpg', '1902')

# Step 4: submit the login form.
# NOTE(review): the uniqueTimestamp query value and the 'rkey' field are
# hard-coded; presumably they were captured from one browser session.
# Verify they are still accepted.
login_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2021041722428'
data = {
    'email': '自己的email',
    'icode': result,
    'origURL': 'http://www.renren.com/home',
    'domain': 'renren.com',
    'key_id': '1',
    'captcha_type': 'web_login',
    'password': '自己的网页密码',
    'rkey': '2def5d84381e7889d5a3035d83561d72',
    'f': 'http%3A%2F%2Fsc.renren.com%2F',
}
response = session.post(url=login_url, headers=headers, data=data, timeout=REQUEST_TIMEOUT)
print(response.status_code)

# Step 5: fetch a profile page with the same session and save it.
detail_url = 'http://www.renren.com/975729432/profile'
detail_page_text = session.get(url=detail_url, headers=headers, timeout=REQUEST_TIMEOUT).text
with open('zhuazei.html', 'w', encoding='utf-8') as fp:
    fp.write(detail_page_text)
|