Adding proxies to requests, and using Selenium (scraping Taobao data by bypassing login)
Adding a proxy to requests
Getting proxy IPs (Mogu Proxy, which hands you an extraction URL)
import requests

def get_ip():
    # Request the extraction URL handed out by the proxy provider
    response = requests.get('the extraction URL from the provider')
    # A body starting with '{' is a JSON error message (rate limited)
    if response.text[0] == '{':
        print('Extracting IPs too frequently! Try again in 10 seconds!')
        return None
    # Otherwise the body is one ip:port per line
    return [ip for ip in response.text.split('\n') if ip != '']
ips = get_ip()
proxies = {
    'http': ips[0],
    'https': ips[1]
}
Adding the proxy to a request
response = requests.get('http://www.gaoimg.com/photo/game/', headers=headers, proxies=proxies)
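Any IP in the list can expire at any moment, in which case requests raises a ProxyError. A minimal retry sketch under that assumption (get_with_proxy is a hypothetical helper; get_ip() and the proxies format come from the snippets above):

import requests

def get_with_proxy(url, ips, headers=None):
    # Try each ip:port from get_ip() until one of them answers
    for ip in ips:
        proxies = {'http': ip, 'https': ip}
        try:
            return requests.get(url, headers=headers, proxies=proxies, timeout=5)
        except requests.exceptions.RequestException:
            continue  # expired or unreachable proxy, try the next one
    return None  # every proxy failed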
Using Selenium - installing the driver
- In Chrome, type chrome://version/ in the address bar and press Enter to see the browser version.
- With that version number, go to http://npm.taobao.org/mirrors/chromedriver, download the matching executable, and put it in the folder where Python is installed.
- If no executable matches your version exactly, download the TXT file at the end of the page whose name best matches your Chrome version; the version number inside it is the one to download. (On Windows, just download the 32-bit build; it is the only one.)
from selenium.webdriver import Chrome
driver = Chrome()
driver.get('https://huaban.com/explore/hunsha-1')
If running the program above opens a browser window, the installation succeeded.
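If the driver ends up somewhere other than the Python folder, the Selenium 3 constructor also accepts an explicit path (the path below is a made-up example):

from selenium.webdriver import Chrome

# Point Selenium directly at the downloaded chromedriver binary
driver = Chrome(executable_path=r'D:\tools\chromedriver.exe')
driver.get('https://huaban.com/explore/hunsha-1')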
Using Selenium
from selenium.webdriver import Chrome
from selenium.webdriver.common.keys import Keys
- Create the browser object
driver = Chrome()
- Open a page
driver.get('https://www.51job.com')
- Find an element
search = driver.find_element_by_id('kwdselectid')
- Type into the input box ('数据分析' means "data analysis")
search.send_keys('数据分析')
search.send_keys(Keys.ENTER)
- Get the page source
print(driver.page_source)
- Find the next-page button and click it
next_btn = driver.find_element_by_class_name('next')
next_btn.click()
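Note that find_element_by_id and find_element_by_class_name belong to the Selenium 3 API; Selenium 4 removed them in favor of find_element with a By locator. The equivalent calls:

from selenium.webdriver.common.by import By

search = driver.find_element(By.ID, 'kwdselectid')
next_btn = driver.find_element(By.CLASS_NAME, 'next')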
Common Selenium configuration
from selenium.webdriver import Chrome, ChromeOptions
Create a browser options object
options = ChromeOptions()
Remove the automation notice (the "controlled by automated test software" bar shown at the top of the page)
options.add_experimental_option('excludeSwitches', ['enable-automation'])
Disable image loading (on image-heavy pages, rendering the images only slows loading down)
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
Set a proxy (proxy_ip is a placeholder for an ip:port string)
options.add_argument(f'--proxy-server=http://{proxy_ip}')
Create the browser object
driver = Chrome(options=options)
The options take effect only if they are passed in when the browser object is created.
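Two more switches that often come in handy (both are standard Chrome flags, not part of the original configuration):

# Run without opening a visible browser window
options.add_argument('--headless')
# Fix the window size so element positions stay stable
options.add_argument('--window-size=1920,1080')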
Getting cookies
Sites with a login wall, such as Zhihu and Taobao, require the login to be bypassed.
Taobao is used as the example here.
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys
import time

options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
driver = Chrome(options=options)

def save_cookie(url='https://www.taobao.com/'):
    driver.get(url)
    # Searching forces Taobao to show the login page
    search = driver.find_element_by_id('q')
    search.send_keys('鞋子')  # search for "shoes"
    search.send_keys(Keys.ENTER)
    # 15 seconds to log in manually, e.g. by scanning the QR code
    time.sleep(15)
    cookies = driver.get_cookies()
    file = open('files/tb_cookies.txt', 'w', encoding='utf-8')
    file.write(str(cookies))
    file.close()  # close so the cookies are flushed to disk

save_cookie()
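Writing str(cookies) means reading the file back with eval(). A sketch of the same round trip with json instead, which avoids eval'ing file contents (only the serialization format changes; the tb_cookies filename is kept from above):

import json

# Save: driver.get_cookies() returns a list of dicts, which is valid JSON
with open('files/tb_cookies.json', 'w', encoding='utf-8') as f:
    json.dump(driver.get_cookies(), f)

# Load it back later
with open('files/tb_cookies.json', 'r', encoding='utf-8') as f:
    cookies = json.load(f)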
Scraping Taobao
from selenium.webdriver import Chrome, ChromeOptions
from selenium.webdriver.common.keys import Keys

options = ChromeOptions()
options.add_experimental_option("excludeSwitches", ['enable-automation', 'enable-logging'])
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})
driver = Chrome(options=options)
# A page on the target domain must be open before cookies can be added
driver.get('https://www.taobao.com/')
cookies = eval(open('files/tb_cookies.txt', 'r', encoding='utf-8').read())
for cookie in cookies:
    # Keep only the cookies flagged secure
    if cookie['secure']:
        driver.add_cookie(cookie)
# Reload so the page picks up the login state
driver.get('https://www.taobao.com/')
search = driver.find_element_by_id('q')
search.send_keys('鞋子')  # search for "shoes"
search.send_keys(Keys.ENTER)
print(driver.page_source)
The saved cookies give login-free access to the search results (bypassing the login). If a "take a break" interception page appears even though the login succeeded, you have been logging in too frequently; wait a while and try again.
Full source for the Taobao scrape: https://pan.baidu.com/s/18Unb_k8YwHgspAOEXYN5oQ (extraction code: gelv)
Scraping job listings from 51job
Imports
from selenium.webdriver import Chrome,ChromeOptions
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import csv
import time
Create the options object
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
Create the browser object
driver = Chrome(options=options)
driver.get('https://www.51job.com')
Search for the position
search = driver.find_element_by_id('kwdselectid')
search.send_keys('数据分析')
search.send_keys(Keys.ENTER)
Extract the job information
def get_messages():
    soup = BeautifulSoup(driver.page_source, 'lxml')
    # Each child div of div.j_joblist is one job posting
    messages = soup.select('div.j_joblist>div')
    jobs_names = []
    company_names = []
    salaries = []
    details = []
    for message in messages:
        jobs_names.append(message.select_one('.t>span').get_text())
        company_names.append(message.select_one('.er>a').get_text())
        # Some postings omit the salary
        if message.select_one('.sal'):
            salaries.append(message.select_one('.sal').get_text())
        else:
            salaries.append('工资面议')  # "salary negotiable"
        details.append(message.select_one('.e>a').attrs['href'])
    return zip(jobs_names, company_names, salaries, details)
Write the data
def write_message(jobs):
    # newline='' stops csv from inserting blank lines on Windows
    file = open('files/jobs51/jobs.csv', 'a', encoding='utf-8', newline='')
    writer = csv.writer(file)
    # The file is opened in append mode once per page, so write the header row
    # (job title, company, salary, details URL) only while the file is still empty
    if file.tell() == 0:
        writer.writerow(['工作岗位', '公司名称', '薪资', '岗位详情'])
    for job in jobs:
        writer.writerow(job)
    file.close()
Turning the page
def next_page():
    time.sleep(1)
    click_next = driver.find_element_by_class_name('next')
    click_next.click()
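The fixed one-second sleep assumes the button has rendered by then. A sketch of the same click with an explicit wait instead (WebDriverWait and expected_conditions are part of Selenium's standard support module; swapping them in here is a suggestion, not the original approach):

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def next_page_waiting():  # hypothetical alternative to next_page() above
    # Block for up to 10 seconds until the button is clickable, then click it
    WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CLASS_NAME, 'next'))
    ).click()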
Main loop
index = 0
while True:
    try:
        jobs = get_messages()
        write_message(jobs)
        index += 1
        time.sleep(5)
        # Raises on the last page, where there is no clickable next button
        next_page()
        print(f'Page {index} written!')
    except Exception:
        print('All pages written!')
        break
Opening the site shows that the total page count on 51job is a static value of 176, so the loop does not have to be infinite; a fixed page count works as well, as in the sketch below.
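A minimal fixed-count sketch under the assumption that the 176-page total stays accurate (it reuses get_messages, write_message, and next_page from above):

for index in range(1, 177):
    write_message(get_messages())
    print(f'Page {index} written!')
    time.sleep(5)
    if index < 176:  # no next button to click on the last page
        next_page()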
[Figure: screenshot of part of the scraped CSV file]