通过 Python 的 urllib 和 re 模块提取豆瓣 Top250 电影的排名、电影名、导演名和电影评分信息。
from urllib.request import Request,urlopen
import re
def get_page(url):
    """Download *url* and return the response body decoded as UTF-8.

    The request is sent with an iPhone Safari ``User-Agent`` header
    (presumably so the site does not reject urllib's default agent —
    confirm against the target site).

    Args:
        url: Address of the page to fetch.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    request = Request(
        url=url,
        headers={
            'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
        },
    )
    # Use the response as a context manager so the underlying connection
    # is closed even when read() or decode() raises.
    with urlopen(request) as response:
        return response.read().decode('utf-8')
# Compiled once at import time: get_infomation() is called for every page,
# and the pattern never changes between calls.  re.S lets ``.`` span the
# newlines inside each ``<div class="item">`` block.
_MOVIE_ITEM_PATTERN = re.compile(
    r'<div class="item">.*?<em class="">(?P<排名>.*?)</em>'
    r'.*?<span class="title">(?P<电影名>.*?)</span>'
    r'.*?导演: (?P<导演>.*?) '
    r'.*?<span class="rating_num" property="v:average">(?P<评分>.*?)</span>',
    re.S,
)


def get_infomation(s):
    """Extract rank, title, director and rating for every movie in *s*.

    Args:
        s: HTML text of one listing page.

    Returns:
        A list with one dict per matched ``<div class="item">`` entry, in
        document order, keyed by the pattern's group names ``排名``,
        ``电影名``, ``导演`` and ``评分``.  All values are strings.  The
        director value stops at the first space after ``导演: ``.  An empty
        list is returned when nothing matches.
    """
    return [match.groupdict() for match in _MOVIE_ITEM_PATTERN.finditer(s)]
def main():
    """Scrape ten Douban Top 250 listing pages and save them to ``test.txt``.

    Pages are fetched with ``start`` offsets 0, 25, ..., 225.  Every movie
    found becomes one line in the output file, for example::

        排名: 1, 电影名: 肖申克的救赎, 导演: 弗兰克·德拉邦特, 评分: 9.7

    The file is written in UTF-8 and replaced on every run.
    """
    movies = []
    for page in range(10):
        # The ``start`` query parameter advances by 25 for each page.
        html = get_page(f'https://movie.douban.com/top250?start={page * 25}')
        movies += get_infomation(html)

    # Build each line directly from the record.  Writing ``str(dict)`` and
    # then deleting every brace and quote character would also strip those
    # characters from the scraped values themselves.
    lines = [
        ', '.join(f'{key}: {value}' for key, value in movie.items()) + '\n'
        for movie in movies
    ]

    # Mode 'w' rather than 'a': a re-run must not stack a second copy of
    # the list on top of the previous run's output.
    with open("test.txt", mode='w', encoding='utf-8') as fp:
        fp.writelines(lines)
# Run the scraper only when this file is executed as a script, so importing
# the module does not trigger network requests or file writes.
if __name__ == "__main__":
    main()
最后的效果:
|