# Required libraries: re, time (pauses between requests so Maoyan does not flag the IP as abnormal), requests
import time
import requests
import re
def run():
    """Scrape movie titles from the Maoyan ``board/4`` pages into a text file.

    Requests the board at offsets 0, 10, ..., 90 (ten pages), extracts the
    ``title`` attribute of every link found inside a ``<p class="name">``
    element, prints each title, and appends it as one line to
    ``maoyan_movie.txt`` in the current working directory.

    A one-second pause follows every page so the requests are throttled.

    Raises:
        requests.exceptions.RequestException: propagated unchanged when a
            page cannot be fetched (connection failure or timeout).
    """
    url = "https://www.maoyan.com/board/4?requestCode=3712207890bccddba4b4ca833e26c8e554rgr&offset={}"
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.93 Safari/537.36"}

    # Compiled once: the pattern does not change between pages.
    title_pattern = re.compile(
        r'<p class="name"><a href=".*?" title="(.*?)" .*?">.*?</a></p>', re.S
    )

    # page_number is printed only as a progress marker, to confirm that the
    # crawl is really advancing from one page to the next.
    for page_number, offset in enumerate(range(0, 91, 10), start=1):
        # A timeout keeps a stalled connection from hanging the script forever.
        response = requests.get(
            url=url.format(offset), headers=headers, timeout=10
        )
        titles = title_pattern.findall(response.text)

        print(page_number)
        for title in titles:
            print(title, type(title))

        # Open the file once per page instead of once per title, and only
        # when there is something to write. The explicit UTF-8 encoding keeps
        # the Chinese titles from depending on the platform default encoding.
        if titles:
            with open("maoyan_movie.txt", "a", encoding="utf-8") as f:
                f.writelines(title + "\n" for title in titles)

        # Throttle between pages so the site does not flag the client.
        time.sleep(1)
# Start the scrape. This call sits at module top level, so it runs whenever
# the file is executed as a script or imported.
run()
|