Python Web Scraping with requests and bs4
1. Using requests
1.1 Sending Requests
requests: a third-party Python library for making network requests over HTTP
requests.get(url, headers=headers) - send a GET request
requests.post(url, headers=headers) - send a POST request
Parameters:
url - the request address (a website URL, an API endpoint, an image address, etc.)
headers - set the request headers (used when setting the cookie and User-Agent)
params - set the request parameters
proxies - set a proxy (see the sketch below)
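The proxies value is a dict mapping each protocol to a proxy address. A minimal sketch, assuming a placeholder proxy running at 127.0.0.1:8888 (swap in a real proxy address before running):
import requests

# hypothetical proxy address, for illustration only
proxies = {
    'http': 'http://127.0.0.1:8888',
    'https': 'http://127.0.0.1:8888'
}
response = requests.get('http://www.example.com', proxies=proxies)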
Send a GET request, with the parameters appended directly to the URL:
requests.get('http://api.tianapi.com/auto/index?key=c9d408fefd8ed4081a9079d0d6165d43&num=10')
Send a POST request, with the parameters set in params:
params = {
    'key': 'c9d408fefd8ed4081a9079d0d6165d43',
    'num': 10
}
requests.post('http://api.tianapi.com/auto/index', params=params)
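Note that params is always appended to the URL's query string, even for a POST request. To send the parameters in the request body as form data instead, requests provides the data parameter. A minimal sketch, assuming the same interface also accepts form data:
data = {
    'key': 'c9d408fefd8ed4081a9079d0d6165d43',
    'num': 10
}
# data= puts the parameters in the POST body instead of the URL
requests.post('http://api.tianapi.com/auto/index', data=data)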
1.2 Reading the Response
response = requests.get('http://www.yingjiesheng.com/')
Set the encoding (only needed when the text comes back garbled)
response.encoding = 'GBK'
Get the response headers
print(response.headers)
Get the response body
a. Get the text value (used when requesting a web page; returns the page source directly)
print(response.text)
b. Get the parsed JSON result (used for API endpoints that return JSON data)
print(response.json())
c. Get the content value (the raw binary data; used for downloading images, video, and audio)
print(response.content)
2. Adding Request Headers
2.1 Adding only a User-Agent
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
response = requests.get('https://www.51job.com/', headers=headers)
response.encoding = 'gbk'
print(response.text)
2.2 Adding both a User-Agent and a cookie
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36',
    'cookie': 'your own cookie string'
}
response = requests.get('https://www.zhihu.com/', headers=headers)
print(response.text)
3. JSON Parsing
import requests
response = requests.get('https://www.toutiao.com/hot-event/hot-board/?origin=toutiao_pc&_signature=_02B4Z6wo00f01k.O6AwAAIDDZESzymP6Zp5P6uyAAPLvqanBbyCJEJJP8E.2Pol60fAraR8Zuvkny9gsdVRqamqSqAjbC0WRO65XKkqkN3dsrKIyrPCcZVn40kghH6SLPb-hGdDuVDqJI1b6c8')
all_news = response.json()['data']
for news in all_news:
    print(news['Title'])
    print(news['Image']['Url'])
4. Image Downloads
4.1 Single image download
import requests

def download_image(img_url: str):
    response = requests.get(img_url)
    data = response.content
    # name the file after the last segment of the URL; "with" closes the file after writing
    with open(f'files/{img_url.split("/")[-1]}', 'wb') as f:
        f.write(data)

if __name__ == '__main__':
    download_image('https://p5.toutiaoimg.com/img/pgc-image/9f5d102756354b6db8fa9408c57d01c8~cs_noop.png')
4.2 Multiple image download
import requests
from re import findall

def download_image(img_url: str):
    response = requests.get(img_url)
    data = response.content
    # strip the "!style" suffix from the file name before saving
    with open(f'files/{img_url.split("/")[-1].split("!")[0]}', 'wb') as f:
        f.write(data)

if __name__ == '__main__':
    response = requests.get('https://www.58pic.com/tupian/qixi-0-0.html')
    # pull every <img src="..."> value out of the page source
    result = findall(r'(?s)<img src="(\S+?)">', response.text)
    for x in result:
        download_image(f'https:{x}')
5. Using bs4
5.1 Prepare the page data to parse (in practice it is fetched with requests or selenium)
data = open('test2.html', encoding='utf-8').read()
5.2 Create a BeautifulSoup object (it can automatically correct some HTML errors in the data)
BeautifulSoup(data, parser)
from bs4 import BeautifulSoup
soup = BeautifulSoup(data, 'lxml')
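As noted in 5.1, the page data usually comes straight from a requests response rather than a local file. A minimal sketch tying the two libraries together, reusing the site from section 1.2:
import requests
from bs4 import BeautifulSoup

response = requests.get('http://www.yingjiesheng.com/')
# parse the downloaded page source instead of a local file
soup = BeautifulSoup(response.text, 'lxml')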
5.3 Getting tags and tag content through the BeautifulSoup object
5.3.1 Getting tags
BeautifulSoup object.select(CSS selector) - gets all tags matched by the CSS selector; returns a list whose elements are the matched tag objects
BeautifulSoup object.select_one(CSS selector) - gets the first tag matched by the CSS selector; returns a tag object
result = soup.select('p')          # all <p> tags
print(result)
result = soup.select_one('p')      # the first <p> tag
print(result)
result = soup.select('#p1')        # the tag with id="p1"
print(result)
result = soup.select('div p')      # <p> tags nested inside a <div>
print(result)
5.3.2 Getting tag content
a. tag object.string - gets the text inside the tag (only valid when the tag's content is plain text; otherwise the result is None)
p2 = soup.select_one('div>p')
print(p2.string)
s1 = soup.select_one('#s1')
print(s1.string)
b. tag object.get_text() - gets all the text inside the tag, including text in nested tags
print(p2.get_text())
print(s1.get_text())
c. tag object.contents - a list of the tag's direct children (text pieces and child tags)
print(p2.contents)
result = s1.contents
print(result)
print(result[-1].get_text())
5.3.3 Getting tag attributes
a1 = soup.select_one('div>a')
print(a1)
print(a1.attrs['href'])
img1 = soup.select_one('img')
print(img1)
print(img1.attrs['src'])
Supplement
BeautifulSoup object.select/select_one(CSS selector) - searches the entire page for tags matched by the selector
tag object.select/select_one(CSS selector) - searches only within the given tag, as shown in the sketch below
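A minimal sketch of the difference, assuming the test page from 5.1 contains a <div> with <a> tags inside it:
div1 = soup.select_one('div')   # searches the whole page for the first <div>
links = div1.select('a')        # searches only inside that <div> for <a> tags
print(links)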
Exercise
import requests
from bs4 import BeautifulSoup
import csv

# newline='' keeps the csv module from inserting blank rows on Windows
f = open('files/豆瓣TOP250.csv', 'w', encoding='utf-8', newline='')
writer = csv.writer(f)
writer.writerow(['cover', 'title', 'quote', 'rating', 'num_reviews'])
i = 0

def traverse(all_movie_li, i):
    for li in all_movie_li:
        list_movie = []
        img_url = li.select_one('.pic>a>img').attrs['src']
        list_movie.append(img_url)
        print(img_url)
        name = li.select_one('.title').get_text()
        list_movie.append(name)
        print(name)
        # some entries have no one-line quote, so select_one returns None
        try:
            des = li.select_one('.inq').get_text()
            list_movie.append(des)
            print(des)
        except AttributeError:
            list_movie.append('N/A')
            print('N/A')
        score = li.select_one('.rating_num').get_text()
        list_movie.append(score)
        print(score)
        num_people = li.select_one('.info > div.bd > div > span:nth-child(4)').get_text()
        list_movie.append(num_people)
        print(num_people)
        writer.writerow(list_movie)
        i += 1
        print('--------------------------------------', i)
    return i

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36'
}
response = requests.get('https://movie.douban.com/top250', headers=headers)
soup = BeautifulSoup(response.text, 'lxml')
all_movie_li = soup.select('#content > div > div.article > ol > li')
print(all_movie_li)
i = traverse(all_movie_li, i)
# follow the pagination links at the bottom of the first page
all_movie_a = soup.select('#content > div > div.article > div.paginator > a')
for x in all_movie_a:
    all_movie_http = x.attrs['href']
    response = requests.get(f'https://movie.douban.com/top250{all_movie_http}', headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    all_movie_li = soup.select('#content > div > div.article > ol > li')
    i = traverse(all_movie_li, i)
f.close()