此博客仅用于记录个人学习进度,学识浅薄,若有错误观点欢迎评论区指出。欢迎各位前来交流。(部分材料来源网络,若有侵权,立即删除) 本人博客所有文章纯属学习之用,不涉及商业利益。不合适引用,自当删除! 若被用于非法行为,与我本人无关
情况说明
- 简简单单爬取就好了
- 正则匹配去除空格和换行符
- 另外,访问频率过高会被封禁(ban),可以在请求之间用 time.sleep 暂停一下,或者搭建一个代理 IP 池
代码
import re
import time

import requests
from lxml import etree
# HTTP headers sent with every page request.
headers = {
    # Desktop Chrome user-agent string.
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
    # Left empty here; presumably to be filled with a real session cookie
    # if the site requires one -- confirm before relying on it.
    "Cookie": '',
}
def _strip_whitespace(text):
    """Return *text* with every whitespace character (spaces, newlines) removed."""
    return re.sub(r'\s+', '', text)


def get_sina_news_serach(pages=10, delay=3):
    """Scrape the maoyan.com board listing and return one record per movie.

    Each page is fetched from ``https://maoyan.com/board/4?offset=<page*10>``
    and every ``<dd>`` entry of the listing is turned into a record
    ``[name, actors, release_time, score]``.  Each record is also printed
    together with its running rank, as before.

    Args:
        pages: Number of listing pages to fetch.  Offsets advance by 10 per
            page, so the default of 10 covers offsets 0-90 (presumably the
            whole board -- confirm against the site).
        delay: Seconds to wait between two page requests, to keep the request
            rate low.

    Returns:
        A list of ``[name, actors, release_time, score]`` lists, in page order.
    """
    result = []
    # One session for the whole crawl instead of a new one per page.
    with requests.Session() as session:
        for page in range(pages):
            if page:
                # Throttle between page requests (not needed before the first).
                time.sleep(delay)
            url = 'https://maoyan.com/board/4?offset={}'.format(page * 10)
            # A timeout keeps a stalled connection from hanging the crawl.
            r = session.get(url, headers=headers, timeout=10)
            r.encoding = 'utf-8'
            trees = etree.HTML(r.text)
            if trees is None:
                # Nothing parseable came back for this page.
                continue
            # Select the entries themselves rather than indexing dd[0..10]:
            # XPath positions are 1-based, so dd[0] never matches anything.
            entries = trees.xpath('//*[@id="app"]/div/div/div[1]/dl/dd')
            for position, entry in enumerate(entries, start=1):
                name = entry.xpath('./a/img[2]/@alt')
                if not name:
                    # Entry without a title image; nothing useful to record.
                    continue
                actor_parts = entry.xpath('./div/div/div[1]/p[2]/text()')
                release_parts = entry.xpath('./div/div/div[1]/p[3]/text()')
                # The score is split across two <i> elements in the markup.
                score1 = entry.xpath('./div/div/div[2]/p/i[1]/text()')
                score2 = entry.xpath('./div/div/div[2]/p/i[2]/text()')
                data = [
                    name[0],
                    ''.join(_strip_whitespace(part) for part in actor_parts),
                    ''.join(_strip_whitespace(part) for part in release_parts),
                    # Slices instead of [0] so a missing score yields ''.
                    ''.join(score1[:1] + score2[:1]),
                ]
                print(page * 10 + position, data)
                result.append(data)
    return result
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    get_sina_news_serach()
|