import json
import os
import re

import requests
from lxml import etree
from openpyxl import Workbook
# Scrape the genre list from Douban's movie chart page, then for every genre
# fetch its ranked movie list from the JSON endpoints and write each movie
# title as one row of a single-column Excel sheet.
wb = Workbook()
ws = wb.active
ws.append(['电影名称'])  # header row of the sheet

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}
base_url = 'https://movie.douban.com/chart'
# Both JSON endpoints take the numeric genre id as `type`; `100%3A90` is the
# URL-encoded form of `100:90`.
movies_count_url = 'https://movie.douban.com/j/chart/top_list_count?type={}&interval_id=100%3A90'
movies_url = 'https://movie.douban.com/j/chart/top_list?type={}&interval_id=100%3A90&action=&start=0&limit={}'

# Without a timeout, requests waits indefinitely on a stalled connection.
request_timeout = 15  # seconds
output_path = './data/豆瓣电影大全.xlsx'
# Compiled once; pulls the numeric genre id out of a genre link's href.
type_id_pattern = re.compile(r'type=(\d+)')


def _get(url):
    """Fetch *url* with the shared headers and timeout.

    Raises requests.HTTPError on a 4xx/5xx response so that an error page is
    never handed to the HTML/JSON parsing that follows.
    """
    reply = requests.get(url=url, headers=headers, timeout=request_timeout)
    reply.raise_for_status()
    return reply


response = _get(base_url)
html = etree.HTML(response.text)
span_list = html.xpath('//div[@class="types"]/span')
for span in span_list:
    titles = span.xpath('./a/text()')
    hrefs = span.xpath('./a/@href')
    # Skip spans that carry no link instead of crashing on an empty result.
    if not titles or not hrefs:
        continue
    big_title = titles[0]
    id_match = type_id_pattern.search(hrefs[0])
    if id_match is None:
        continue
    type_id = id_match.group(1)

    # First ask how many movies the genre has, then request them all at once
    # by passing that count as `limit`.
    resp = _get(movies_count_url.format(type_id))
    movies_count = json.loads(resp.text)['total']

    res = _get(movies_url.format(type_id, movies_count))
    movies_list = json.loads(res.text)
    print(big_title)
    for movies in movies_list:
        ws.append([movies['title']])

# Create the output directory first so the save cannot fail on a missing
# folder after all the scraping work has been done.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
wb.save(output_path)
|