Python爬虫实例2
爬取豆瓣电影TOP250(https://movie.douban.com/top250)的相关内容
step1 准备工作
目标:
爬取豆瓣电影TOP250的电影名称、豆瓣评分、评价数、电影概况、电影链接等
分析:
第一页URL:https://movie.douban.com/top250,展示了排行1-25的电影; 第二页URL:https://movie.douban.com/top250?start=25&filter=,展示了排行26-50的电影; … 获取TOP250,需要分开请求10次,参数start分别为:0,25…225
step2 获取数据
1.爬取第一页的源代码
import urllib.request

# Douban rejects the default urllib User-Agent, so impersonate a browser.
target_url = "https://movie.douban.com/top250"
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
}

# Build the request, fetch the first results page, and decode it as UTF-8.
page_request = urllib.request.Request(url=target_url, headers=request_headers)
page_response = urllib.request.urlopen(page_request)
html = page_response.read().decode('utf-8')
print(html)
小插曲:
其中出现了“unable to get local issuer certificate (_ssl.c:1129)”错误,加入下方代码即可
# Work around "unable to get local issuer certificate (_ssl.c:1129)" by
# disabling HTTPS certificate verification for the whole process.
# NOTE(review): this is insecure (no cert validation); prefer installing
# proper CA certificates (e.g. certifi / "Install Certificates" on macOS).
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
2.提取信息
每部电影对应一个li节点
<li>
<div class="item">
<div class="pic">
<em class="">1</em>
<a href="https://movie.douban.com/subject/1292052/">
<img width="100" alt="肖申克的救赎" src="https://img2.doubanio.com/view/photo/s_ratio_poster/public/p480747492.jpg" class="">
</a>
</div>
<div class="info">
<div class="hd">
<a href="https://movie.douban.com/subject/1292052/" class="">
<span class="title">肖申克的救赎</span>
<span class="title"> / The Shawshank Redemption</span>
<span class="other"> / 月黑高飞(港) / 刺激1995(台)</span>
</a>
<span class="playable">[可播放]</span>
</div>
<div class="bd">
<p class="">
导演: 弗兰克·德拉邦特 Frank Darabont 主演: 蒂姆·罗宾斯 Tim Robbins /...<br>
1994 / 美国 / 犯罪 剧情
</p>
<div class="star">
<span class="rating5-t"></span>
<span class="rating_num" property="v:average">9.7</span>
<span property="v:best" content="10.0"></span>
<span>2466166人评价</span>
</div>
<p class="quote">
<span class="inq">希望让人自由。</span>
</p>
</div>
</div>
</div>
</li>
使用BeautifulSoup、re库匹配信息
from bs4 import BeautifulSoup
import re

# Pre-compiled patterns, one per output column of a <div class="item"> node.
findlink = re.compile(r'<a href="(.*?)">')
findImg = re.compile(r'<img.*?src="(.*?)"', re.S)
findTitle = re.compile(r'<span class="title">(.*?)</span>')
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
findJudge = re.compile(r'<span>(\d*)人评价</span>')
findInq = re.compile(r'<span class="inq">(.*?)</span>')
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)

soup = BeautifulSoup(html, 'html.parser')
for movie_node in soup.find_all('div', class_="item"):
    fragment = str(movie_node)
    # First match of each pattern, in column order:
    # link, image, title, rating, vote count, one-line quote, credits block.
    data = [
        re.findall(findlink, fragment)[0],
        re.findall(findImg, fragment)[0],
        re.findall(findTitle, fragment)[0],
        re.findall(findRating, fragment)[0],
        re.findall(findJudge, fragment)[0],
        re.findall(findInq, fragment)[0],
        re.findall(findBd, fragment)[0],
    ]
    print(data)
得到了第一页排名1-25的电影信息
分页爬取
给URL传入参数start=0,25…225
def main():
    """Crawl every TOP250 results page and print the collected rows."""
    base = "https://movie.douban.com/top250?start="
    print(getData(base))
在askURL函数添加了error
def askURL(url):
    """Fetch *url* with a browser User-Agent and return the page as a str.

    On URLError the HTTP code and/or reason are printed and the empty
    string is returned, so callers always get a str back.
    """
    ua_headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
    }
    page = ''
    try:
        req = urllib.request.Request(url=url, headers=ua_headers)
        resp = urllib.request.urlopen(req)
        page = resp.read().decode('utf-8')
    except urllib.error.URLError as err:
        # Report whichever diagnostic attributes the error carries.
        for attr in ("code", "reason"):
            if hasattr(err, attr):
                print(getattr(err, attr))
    return page
def getData(baseurl):
    """Scrape all ten TOP250 pages and return a list of 7-field rows.

    Each row: [link, image URL, title, rating, vote count, quote, credits].
    """
    rows = []
    for page in range(10):
        # Pages are addressed by start offset: 0, 25, ..., 225.
        html = askURL(baseurl + str(page * 25))
        soup = BeautifulSoup(html, 'html.parser')
        for node in soup.find_all('div', class_="item"):
            text = str(node)
            row = [
                re.findall(findlink, text)[0],    # detail-page link
                re.findall(findImg, text)[0],     # poster image URL
                re.findall(findTitle, text)[0],   # first (Chinese) title
                re.findall(findRating, text)[0],  # Douban rating
                re.findall(findJudge, text)[0],   # number of ratings
            ]
            quotes = re.findall(findInq, text)
            # Some movies have no one-line quote; keep column width stable.
            row.append(quotes[0].replace("。", "") if quotes else " ")
            row.append(re.findall(findBd, text)[0])  # director/cast/year block
            rows.append(row)
    return rows
step3 保存数据
def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook at *savepath*.

    datalist: list of 7-element rows as produced by getData().
    savepath: output file path for the xlwt workbook.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("豆瓣电影TOP250")
    col = ("电影链接", "图片链接", "电影名称", "电影评分", "评价人数", "评语", "背景")
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate the actual rows rather than a hard-coded range(250):
    # a partially failed crawl would otherwise raise IndexError here.
    for i, data in enumerate(datalist):
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
结果: 爬取成功!!!
完整代码:
import xlwt
from bs4 import BeautifulSoup
import re
import ssl
import urllib.request
import urllib.error
ssl._create_default_https_context = ssl._create_unverified_context
# Field-extraction patterns, applied to the str() of one <div class="item">.
findlink = re.compile(r'<a href="(.*?)">')  # movie detail-page URL
findImg = re.compile(r'<img.*?src="(.*?)"', re.S)  # poster image URL (re.S: tag may span lines)
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie title span(s)
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # Douban score
findJudge = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line quote (may be absent)
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # director/cast/year block
def askURL(url, timeout=15):
    """Fetch *url* and return the page HTML as a str.

    A browser User-Agent is sent because Douban rejects the default urllib
    agent. On URLError the HTTP code and/or reason are printed and '' is
    returned, so callers always receive a str.

    timeout: seconds before the connection attempt is abandoned. New
    keyword with a default, so existing callers are unaffected; without it
    urlopen() could block indefinitely on an unresponsive server.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.0 Safari/605.1.15'
    }
    html = ''
    try:
        request = urllib.request.Request(url=url, headers=headers)
        response = urllib.request.urlopen(request, timeout=timeout)
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def getData(baseurl):
    """Crawl all ten TOP250 pages and collect one 7-field row per movie.

    Returns a list of rows:
    [link, image URL, title, rating, vote count, quote, credits block].
    """
    datalist = []
    for offset in (page * 25 for page in range(10)):
        # Pages are addressed by start offset: 0, 25, ..., 225.
        html = askURL(baseurl + str(offset))
        soup = BeautifulSoup(html, 'html.parser')
        for node in soup.find_all('div', class_="item"):
            fragment = str(node)
            record = [
                re.findall(findlink, fragment)[0],    # detail-page link
                re.findall(findImg, fragment)[0],     # poster image URL
                re.findall(findTitle, fragment)[0],   # first (Chinese) title
                re.findall(findRating, fragment)[0],  # Douban rating
                re.findall(findJudge, fragment)[0],   # number of ratings
            ]
            quotes = re.findall(findInq, fragment)
            # Some movies lack a quote; pad so every row has 7 columns.
            record.append(quotes[0].replace("。", "") if quotes else " ")
            record.append(re.findall(findBd, fragment)[0])  # credits block
            datalist.append(record)
    return datalist
def saveData(datalist, savepath):
    """Write the scraped rows to an .xls workbook at *savepath*.

    datalist: list of 7-element rows as produced by getData().
    savepath: output file path for the xlwt workbook.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    sheet = book.add_sheet("豆瓣电影TOP250")
    col = ("电影链接", "图片链接", "电影名称", "电影评分", "评价人数", "评语", "背景")
    for j, header in enumerate(col):
        sheet.write(0, j, header)
    # Iterate the actual rows rather than a hard-coded range(250):
    # a partially failed crawl would otherwise raise IndexError here.
    for i, data in enumerate(datalist):
        for j in range(len(col)):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
def main():
    """Entry point: crawl the Douban TOP250 list and save it as .xls."""
    start_url = "https://movie.douban.com/top250?start="
    movies = getData(start_url)
    saveData(movies, "豆瓣电影TOP250.xls")


if __name__ == '__main__':
    main()