import bs4
import requests
def openURL(url, timeout=10):
    """Fetch *url* with an HTTP GET and return the response.

    The request carries a desktop Firefox ``User-Agent`` header instead of
    the default one sent by ``requests``.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up. Passed
            unchanged to ``requests.get``, so a ``(connect, read)`` tuple
            is accepted as well.

    Returns:
        The ``requests.Response`` for the request. The HTTP status code is
        not inspected here; callers receive error pages as-is.

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0'
    }
    # Without an explicit timeout, requests waits indefinitely on a server
    # that accepts the connection but never responds.
    return requests.get(url, headers=headers, timeout=timeout)
def findMovies(res):
    """Extract one summary line per movie from a fetched listing page.

    Three independent lookups are made on the parsed HTML:

    * titles   - text of ``a > span`` inside every ``<div class="hd">``
    * ratings  - text of every ``<span class="rating_num">``
    * details  - second and third lines of the first ``<p>`` inside every
      ``<div class="bd">``; ``bd`` blocks without such a paragraph are
      skipped

    Each title is also printed to stdout as it is found.

    Args:
        res: Object whose ``text`` attribute holds the page HTML (for
            example a ``requests.Response``).

    Returns:
        A list of strings, each ``title + "评分:<rating>" + details + "\\n"``.
        The three lists are paired by position; if they differ in length
        the output stops at the shortest one.
    """
    soup = bs4.BeautifulSoup(res.text, "html.parser")

    movies = []
    for each in soup.find_all("div", class_="hd"):
        title = each.a.span.text
        print(title)
        movies.append(title)

    ranks = [
        "评分:%s" % each.text
        for each in soup.find_all("span", class_="rating_num")
    ]

    messages = []
    for each in soup.find_all("div", class_="bd"):
        try:
            lines = each.p.text.split("\n")
            messages.append(lines[1].strip() + lines[2].strip())
        except (AttributeError, IndexError):
            # AttributeError: this block has no <p> (each.p is None).
            # IndexError: the paragraph has fewer than three lines.
            # Either way it is not a movie description, so skip it.
            continue

    return [
        title + rank + message + "\n"
        for title, rank, message in zip(movies, ranks, messages)
    ]
def findDepth(res):
    """Return the number read from the node two siblings before "next".

    Parses ``res.text`` as HTML, finds the first ``<span class="next">``,
    walks back two siblings from it and converts that node's text to an
    integer. ``main`` uses the result as the number of pages to fetch.

    Args:
        res: Object whose ``text`` attribute holds the page HTML.

    Returns:
        The integer value of the located node's text.
    """
    page = bs4.BeautifulSoup(res.text, "html.parser")
    next_marker = page.find("span", class_="next")
    # Two hops back: presumably the first hop lands on a whitespace text
    # node and the second on the last page-number link - confirm against
    # the live markup.
    last_page_node = next_marker.previous_sibling.previous_sibling
    return int(last_page_node.text)
def main():
    """Scrape every listing page and save the results to ``top250.txt``.

    The landing page is fetched once to learn the page count, then each
    page is requested in order with a ``start`` offset that grows in steps
    of 25. All collected lines are written to ``top250.txt`` (UTF-8,
    overwriting any existing file) after the last page has been processed.
    """
    host = "https://movie.douban.com/top250"
    depth = findDepth(openURL(host))

    page_urls = [
        host + "/?start=" + str(offset)
        for offset in range(0, 25 * depth, 25)
    ]

    result = []
    for url in page_urls:
        result.extend(findMovies(openURL(url)))

    with open("top250.txt", "w", encoding="utf-8") as f:
        f.writelines(result)
# Run the scraper only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()