20220624 Login and Proxy IPs
Login anti-scraping with requests
```python
from selenium.webdriver import Chrome, ChromeOptions

# 1. Create an options object
options = ChromeOptions()
# 1) Hide the "controlled by automated test software" banner
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 2) Disable image loading
options.add_experimental_option('prefs', {'profile.managed_default_content_settings.images': 2})

b = Chrome(options=options)
b.get('https://www.jd.com')

input('end:')
b.close()
```
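Whether the automation flag is actually hidden can be checked from the page itself by reading `navigator.webdriver` (a minimal sketch; the value returned varies with the Chrome version, and on recent builds this option alone may not reset it):

```python
from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
b = Chrome(options=options)
b.get('https://www.jd.com')

# Without the option this usually prints True; with it, False/None on older Chrome builds
print(b.execute_script('return navigator.webdriver'))
b.close()
```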
Getting cookies with selenium
```python
from selenium.webdriver import Chrome
from json import dumps

# 1. Open the site that needs automated login
b = Chrome()
b.get('https://www.taobao.com')

# 2. Leave enough time to log in manually on this page
#    (before continuing, make sure the window for b shows you as logged in)
input('Login finished:')

# 3. Save the post-login cookies to a local file (a JSON file is recommended)
cookies = b.get_cookies()
with open('files/taobao.json', 'w', encoding='utf-8') as f:
    f.write(dumps(cookies))

b.close()
```
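`get_cookies()` returns a list of dicts, one per cookie. To see what actually got saved (a quick sketch over the file written above; typical keys are name, value, domain, path, secure, httpOnly, and expiry):

```python
from json import loads

with open('files/taobao.json', encoding='utf-8') as f:
    cookies = loads(f.read())

# How many cookies were captured, and what fields the first one carries
print(len(cookies), 'cookies saved')
print(sorted(cookies[0].keys()))
```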
Using cookies with selenium
```python
from selenium.webdriver import Chrome
from json import loads

# 1. Open the site to scrape
b = Chrome()
b.get('https://www.taobao.com')

# 2. Read the cookies from the file and add them to the browser object
with open('files/taobao.json', encoding='utf-8') as f:
    cookies = loads(f.read())
for x in cookies:
    b.add_cookie(x)

# 3. Reload the page so the cookies take effect
b.get('https://www.taobao.com')

input('end:')
b.close()
```
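Because the cookies are stored as plain JSON, the same file can also drive requests directly, which is the requests side of login anti-scraping. A minimal sketch, assuming the cookie file saved above; requests accepts a simple name-to-value dict:

```python
import requests
from json import loads

# Flatten selenium's list of cookie dicts into the {name: value} mapping requests expects
with open('files/taobao.json', encoding='utf-8') as f:
    cookies = {c['name']: c['value'] for c in loads(f.read())}

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}
response = requests.get('https://www.taobao.com', headers=headers, cookies=cookies)
response.encoding = 'utf-8'
print(response.status_code)
```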
Using a proxy IP with requests
```python
import requests

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
}

# Build the proxies mapping
# proxies = {
#     'https': 'http://183.165.224.25:4554',
#     'http': 'http://183.165.224.25:4554'
# }
proxies = {
    'https': '183.165.224.25:4554',
    'http': '183.165.224.25:4554'
}

# Pass the proxies when sending the request
response = requests.get('https://www.maoyan.com/', headers=headers, proxies=proxies)
# Fix garbled text in the response
response.encoding = 'utf-8'
print(response.text)
```
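Short-lived proxies fail often, so it helps to bound each attempt with a timeout and catch proxy failures explicitly before retrying; the next section builds this into a loop. A hedged sketch (the IP and the 5-second timeout are placeholder values):

```python
import requests

proxies = {
    'https': '183.165.224.25:4554',
    'http': '183.165.224.25:4554'
}
try:
    # timeout keeps a dead proxy from hanging the request indefinitely
    response = requests.get('https://www.maoyan.com/', proxies=proxies, timeout=5)
    response.encoding = 'utf-8'
    print(response.status_code)
except (requests.exceptions.ProxyError, requests.exceptions.Timeout):
    print('Proxy failed; fetch a new IP and retry')
```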
How to use proxy IPs
```python
import requests
import time
from bs4 import BeautifulSoup


def get_ip():
    """
    Fetch a proxy IP; if fetching fails, wait 2 seconds and try again.
    :return: the proxy address that was obtained
    """
    while True:
        response = requests.get('http://d.jghttp.alicloudecs.com/getip?num=1&type=1&pro=510000&city=510600&yys=0&port=1&time=2&ts=0&ys=0&cs=0&lb=4&sb=0&pb=4&mr=1&regions=')
        result = response.text
        # On failure the API returns a JSON error object (text starting with '{')
        if result[0] == '{':
            print('Failed to get a proxy IP')
            time.sleep(2)
        else:
            return result


def get_net_data():
    url = 'https://www.maoyan.com/'
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
    }

    # Send the request through a proxy IP; if the proxy fails, get a new IP and retry
    while True:
        ip = get_ip()
        print(ip)
        proxy = {
            'https': ip
        }
        response = requests.get(url, headers=headers, proxies=proxy)
        response.encoding = 'utf-8'
        print(response.text)
        soup = BeautifulSoup(response.text, 'lxml')
        movies_div = soup.select('.movie-list .movie-item')
        # An empty result means the proxy was blocked; try again with a new IP
        if len(movies_div) == 0:
            continue
        else:
            print('Scrape succeeded! Continue with parsing')
            break


if __name__ == '__main__':
    get_net_data()
```
Using a proxy IP with selenium
```python
from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
# Route all browser traffic through the proxy server
options.add_argument('--proxy-server=http://115.208.231.37:4545')

b = Chrome(options=options)
b.get('https://www.maoyan.com/')
print(b.page_source)

input('end:')
b.close()
```
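To confirm the proxy is actually in effect, load a page that echoes the client IP (a minimal sketch; httpbin.org/ip is one such service, and this assumes the proxy above is still alive):

```python
from selenium.webdriver import Chrome, ChromeOptions

options = ChromeOptions()
options.add_argument('--proxy-server=http://115.208.231.37:4545')
b = Chrome(options=options)

# The response body should show the proxy's IP, not your own
b.get('https://httpbin.org/ip')
print(b.page_source)
b.close()
```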