1. Add request headers to avoid being blocked
dic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36"
}
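Douban is known to reject requests that carry the requests library's default User-Agent, which is why the header above is needed. A quick way to see the difference (a small check of my own, not part of the original script; Douban commonly answers bare requests with HTTP 418):
import requests
# Without a browser User-Agent Douban typically refuses the request (HTTP 418);
# with the header above it answers normally (HTTP 200)
print(requests.get('https://movie.douban.com/top250').status_code)
print(requests.get('https://movie.douban.com/top250', headers=dic).status_code)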
2. Precompile the regular expression used to filter out the irrelevant content
# cinema_name: movie title; Release_date: release date;
# grade: rating; Evaluation_number: number of ratings; these four fields are extracted
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<cinema_name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<Release_date>.*?)&nbsp;.*?'
                 r'<span class="rating_num" property="v:average">(?P<grade>.*?)</span>.*?'
                 r'<span>(?P<Evaluation_number>.*?)</span>', re.S)
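To see what the four named groups capture, the pattern can be tried against a hand-made fragment shaped like the real page (the sample string below is illustrative, not actual Douban output):
sample = ('<li><div class="item"><span class="title">肖申克的救赎</span>'
          '<p class="">导演: 弗兰克·德拉邦特<br>1994&nbsp;/&nbsp;美国</p>'
          '<span class="rating_num" property="v:average">9.7</span>'
          '<span>1000000人评价</span></li>')
print(obj.search(sample).groupdict())
# {'cinema_name': '肖申克的救赎', 'Release_date': '1994', 'grade': '9.7', 'Evaluation_number': '1000000人评价'}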
3. Fetch the page content
The 250 entries are spread over ten pages, so all ten pages have to be fetched.
# Fetch the page at offset `page` and return the response object;
# it contains all of that page's data
def get_web_content(page):
    url = 'https://movie.douban.com/top250?start=%d&filter=' % page
    web_content = requests.get(url, headers=dic)
    return web_content
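Note that `page` is really the `start` offset in the URL rather than a page number: the first page uses start=0, the second start=25, and so on, which is why the loop below multiplies by 25. For example:
for p in range(3):
    print('https://movie.douban.com/top250?start=%d&filter=' % (p * 25))
# https://movie.douban.com/top250?start=0&filter=
# https://movie.douban.com/top250?start=25&filter=
# https://movie.douban.com/top250?start=50&filter=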
# Concatenate the content of all ten pages
web_content_all = get_web_content(0).text
for i in range(1, 10):
    web_content_all = web_content_all + get_web_content(i * 25).text
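An equivalent way to build the same string (an alternative of my own, not the author's version) collects the pages in a list and joins them once, which avoids rebuilding the growing string on every iteration:
# Same result as the loop above: the HTML of all ten pages in one string
pages = [get_web_content(i * 25).text for i in range(10)]
web_content_all = ''.join(pages)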
4. Filter the fetched content
select_information = obj.finditer(web_content_all)
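finditer returns a lazy iterator of match objects, and it can only be consumed once, so inspecting select_information directly would leave nothing for the writing step. To peek at one result, run a fresh match (a small check of my own):
first = next(obj.finditer(web_content_all), None)
if first is not None:
    print(first.groupdict())  # the four named groups for the first movie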
5. Save the data
# Create a csv file to store the data
# newline="": avoids the extra blank line between csv rows
Data_file = open("data.csv", mode="w", newline="")
csv_writer = csv.writer(Data_file)
# Convert each match into a dict and write it to the csv file
for item in select_information:
    row = item.groupdict()
    row['Release_date'] = row['Release_date'].strip()
    csv_writer.writerow(row.values())
Data_file.close()
print("over!")
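Two optional refinements, both additions of mine rather than part of the original script: an explicit encoding makes the file readable on systems whose default encoding is not UTF-8, and a header row makes the csv self-describing:
import csv
Data_file = open("data.csv", mode="w", newline="", encoding="utf-8")
csv_writer = csv.writer(Data_file)
# Write the column names first, in the same order as groupdict()
csv_writer.writerow(["cinema_name", "Release_date", "grade", "Evaluation_number"])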
6. Complete code
import requests
import re
import csv
# Add request headers to avoid being blocked
dic = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36"
}
# Fetch the page at offset `page` and return the response object;
# it contains all of that page's data
def get_web_content(page):
    url = 'https://movie.douban.com/top250?start=%d&filter=' % page
    web_content = requests.get(url, headers=dic)
    return web_content
# compile: precompile the regular expression so it can be reused
obj = re.compile(r'<li>.*?<div class="item">.*?<span class="title">(?P<cinema_name>.*?)'
                 r'</span>.*?<p class="">.*?<br>(?P<Release_date>.*?)&nbsp;.*?'
                 r'<span class="rating_num" property="v:average">(?P<grade>.*?)</span>.*?'
                 r'<span>(?P<Evaluation_number>.*?)</span>', re.S)
# Fetch the text of all 10 pages (250 entries in total)
web_content_all = get_web_content(0).text
for i in range(1, 10):
    web_content_all = web_content_all + get_web_content(i * 25).text
# Run the matching
select_information = obj.finditer(web_content_all)
# Create a csv file to store the data
# newline="": avoids the extra blank line between csv rows
Data_file = open("data.csv", mode="w", newline="")
csv_writer = csv.writer(Data_file)
# Convert each match into a dict and write it to the csv file
for item in select_information:
    row = item.groupdict()
    row['Release_date'] = row['Release_date'].strip()
    csv_writer.writerow(row.values())
Data_file.close()
print("over!")
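To verify the result, the file can be read back with the same csv module (the printed row shown in the comment is illustrative):
import csv
with open("data.csv", newline="") as f:
    for row in csv.reader(f):
        print(row)  # e.g. ['肖申克的救赎', '1994', '9.7', '2394229人评价']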
Just quietly asking: is it against the rules to write this kind of blog post?