import time
import codecs
import requests
import lxml.html
# Scrape the Douban Top 250 movie list and write one movie per line to
# movies.txt in the current working directory.
#
# The list is paginated through the ``start`` query parameter; this script
# requests 10 pages, advancing ``start`` by 25 each time (0, 25, ..., 225).

# Browser-like User-Agent sent with every request.
myheaders = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/74.0.3729.131 Safari/537.36"
    )
}
url_tpl = "https://movie.douban.com/top250?start={}&filter="

# newline="\n" disables platform newline translation so the file always
# contains plain "\n" line endings, regardless of the operating system.
with open("movies.txt", "w", encoding="utf-8", newline="\n") as f:
    for page in range(10):
        print(f"get page {page + 1}")
        start = page * 25
        url = url_tpl.format(start)

        # A timeout keeps a stalled connection from hanging the script.
        http_response = requests.get(url, headers=myheaders, timeout=10)
        # Fail loudly on 4xx/5xx instead of parsing an error page, which
        # would match no <li> nodes and silently write nothing.
        http_response.raise_for_status()
        http_response.encoding = "utf-8"

        html = lxml.html.fromstring(http_response.text)
        # Each <li> under the content list is treated as one movie entry.
        movies = html.xpath('//*[@id="content"]/div/div[1]/ol/li')
        for movie in movies:
            movie_text = str(movie.text_content())
            # Drop newlines so each movie occupies exactly one output line.
            clean_movie_text = movie_text.replace("\n", "")
            print(clean_movie_text, file=f)

        # Pause between page requests to throttle the crawl.
        time.sleep(5)
运行过程
由于豆瓣 TOP250 的电影页面每页只展示 25 部电影，因此总共需要爬取 10 页。运行完成后，会在 .py 脚本所在的目录下生成一个 movies.txt 文件，文件中每一行对应一部电影，内容就是 TOP250 的详细信息：
|