"""Download a Douban user's movie poster images for one target year.

Approach:
1. Build the full request URL from the given user id (the URL differs for
   each page requested).
2. Make at least one request to learn the total number of record pages
   (the first request also downloads data).
3. Request one page at a time, parse out the user name, poster image URL
   and movie title, and save the posters locally.
4. While downloading, check each entry's marked date to decide whether the
   target year's data is complete; once older records appear, an exception
   is raised to stop the crawl.

Known limitation: Douban cannot filter a collection by year directly, so
fetching an older year scans backwards from the most recent entries, which
is inefficient.
"""
import urllib.request
from lxml import etree
import os
def create_request(page, user_id):
    """Build the HTTP request for one page of a user's Douban collection.

    Douban lists 15 entries per page, so page N starts at offset (N-1)*15.
    A browser User-Agent is sent to avoid being rejected as a bot.
    """
    start = (page - 1) * 15
    target = f'https://movie.douban.com/people/{user_id}/collect?start={start}'
    browser_ua = (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/97.0.4692.99 Safari/537.36'
    )
    return urllib.request.Request(url=target, headers={'User-Agent': browser_ua})
def get_content(request):
    """Send *request* and return the response body decoded as UTF-8.

    The response is opened in a ``with`` block so the underlying socket is
    always closed, even if reading or decoding raises (the previous version
    leaked the connection).
    """
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def down_load(content, page, year):
    """Parse one collection page and save poster images marked in *year*.

    Parameters:
        content: UTF-8 HTML of one Douban collection page.
        page:    1-based page number of *content*; the total page count is
                 only present (and only returned) for the first page.
        year:    target year; entries marked with an earlier year abort the
                 crawl by raising ``Exception`` (caught by the caller as the
                 normal "done" signal).

    Returns:
        int total page count when ``page == 1`` (0 when the marker is
        missing, i.e. the user has no records); ``None`` otherwise.

    Raises:
        Exception: when an entry older than *year* is reached — entries are
        ordered newest-first, so nothing relevant can follow.
    """
    tree = etree.HTML(content)
    user_name = tree.xpath('//div[@class="side-info-txt"]/h3/text()')[0]
    movie_name_list = tree.xpath('//div[@class="grid-view"]/div/div[@class="info"]//em/text()')
    movie_src_list = tree.xpath('//div[@class="grid-view"]//img/@src')
    mark_date_list = tree.xpath('//span[@class="date"]/text()')
    dirs = f'./{user_name}的{year}年度观影海报'
    # exist_ok avoids the check-then-create race of exists()+makedirs().
    os.makedirs(dirs, exist_ok=True)
    # zip instead of parallel indexing: if the XPath lists ever differ in
    # length (page layout quirks), we stop at the shortest instead of
    # raising IndexError.
    for src, movie_name, mark_date in zip(movie_src_list, movie_name_list, mark_date_list):
        # Mark dates look like "YYYY-MM-DD"; only the year matters here.
        mark_year = int(str(mark_date).split('-')[0])
        if mark_year < year:
            # Signals "target year fully downloaded" to the caller.
            raise Exception(f'reached records older than {year}, crawl finished')
        if mark_year == year:
            # Titles are "Chinese / original"; keep the first part and strip
            # the surrounding whitespace so filenames have no trailing space.
            name = movie_name.split('/')[0].strip()
            urllib.request.urlretrieve(src, f'{dirs}/{name}.jpg')
            print(mark_date)
            print(name, src)
    if page == 1:
        total_page = tree.xpath('//span[@class="thispage"]/@data-total-page')
        # Missing pagination marker means the user has no records at all.
        return int(total_page[0]) if total_page else 0
    return None
if __name__ == '__main__':
    # Entry point: ask for the target year and user id, probe page 1 for the
    # total page count (downloading its posters as a side effect), then walk
    # the remaining pages until down_load() raises to signal the year is done.
    year = int(input('输入需要爬取的年份:'))
    user_id = input('输入需要爬取的用户id:')
    page = 1
    try:
        request = create_request(page, user_id)
        content = get_content(request)
        # The first page doubles as the probe for the total page count.
        total_page = down_load(content, page, year)
        print('总记录页数', total_page)
        if total_page == 0:
            print('Error:当前用户无观影数据')
        elif total_page > 1:
            for page in range(2, total_page + 1):
                print(f'当前请求第{page}页')
                request = create_request(page, user_id)
                content = get_content(request)
                down_load(content, page, year)
    except Exception:
        # down_load raises once records older than the target year appear;
        # `except Exception` (not bare `except:`) still lets Ctrl-C through.
        print("当前年度查找完毕")