刚学 requests,做个小练习。
爬取内容:某电影网站上正在热映的电影榜单。
具体字段:电影名称、评分、类型、主演、上映时间,以及电影海报链接(网站缩略图)。
注意事项:由于网站有反爬机制,多次访问后会出现图形验证码,此时将爬取不到数据。
解决方法:用浏览器刷新页面并手动通过验证码即可(直接在代码中通过验证的方式还没学会……)。
import time
import requests
import random
from lxml import etree
class Mymovie(object):
    """Scrape one film-listing page of maoyan.com and append it to a text file.

    Each instance is bound to a single listing page, selected through the
    ``offset`` query parameter.  Typical use is ``Mymovie(offset).star_movie()``.
    """

    def __init__(self, url):
        """Prepare the page URL, the request headers and the result store.

        Args:
            url: Value substituted into the ``offset`` query parameter of the
                listing URL (despite its name it is not a complete URL).
        """
        self.url = 'https://maoyan.com/films?offset={}'.format(url)
        # A desktop-browser User-Agent is sent with every request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        # One dict of parallel field lists is appended per parsed page.
        self.movie_info = []

    def movie_get(self, url, timeout=10):
        """Fetch *url* and return the response body decoded as UTF-8.

        A random pause of 1-3 seconds is taken before the request is sent.

        Args:
            url: Address to request.
            timeout: Seconds handed to ``requests.get`` so a stalled
                connection cannot hang the script forever.

        Returns:
            The response body as ``str``.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status.
        """
        time.sleep(random.uniform(1, 3))
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Fail loudly on an error status instead of parsing an error page.
        response.raise_for_status()
        return response.content.decode('utf-8')

    def screen_html(self, html):
        """Extract the film fields from a listing page and store them.

        Appends one dict to ``self.movie_info`` with the keys ``title``,
        ``score``, ``type``, ``star``, ``show_time`` and ``src_url``; every
        value is a list of strings in document order.  The store is printed
        afterwards.

        Args:
            html: Markup of the listing page.
        """
        tree = etree.HTML(html)

        def select(path):
            # NOTE(review): guards against the parser handing back no root
            # element for blank markup -- confirm against lxml's behaviour.
            return tree.xpath(path) if tree is not None else []

        # All expressions are absolute ("//"), so they are evaluated once
        # against the whole document rather than once per child of the root.
        page = {
            'title': select('//div[@class="channel-detail movie-item-title"]//a[@data-act="movies-click" ]/text()'),
            # The score is collected as the element's full string value;
            # convert here so the store holds plain text only.
            'score': [node.xpath('string(.)') for node in
                      select('//div[ @class="channel-detail channel-detail-orange"]')],
            # For the next three fields only every second text node
            # (indices 1, 3, ...) is kept.
            'type': select('//div[@class="movie-item-hover"]// div[@class="movie-hover-title"][2]/text()')[1::2],
            'star': select('//div[@class="movie-item-hover"]// div[@class="movie-hover-title"][3]/text()')[1::2],
            'show_time': select('//div[@class="movie-item-hover"]// div[@class="movie-hover-title movie-hover-brief"]/text()')[1::2],
            'src_url': select('//div[@class="movie-item-hover"]//img[@class="movie-hover-img"]/@src'),
        }
        self.movie_info.append(page)
        print(self.movie_info)

    def write_movie(self, path='Mymovie.txt'):
        """Append the most recently parsed page to a UTF-8 text file.

        The number of records written is the length of the shortest field
        list, so a page with fewer entries than usual no longer raises
        ``IndexError``.  When there is nothing to write, the file is left
        untouched and a notice is printed instead of the success message.

        Args:
            path: Output file; defaults to ``Mymovie.txt`` in the working
                directory.
        """
        records = []
        if self.movie_info:
            page = self.movie_info[-1]
            # zip() stops at the shortest list.  The lists are matched purely
            # by position, so a film missing one field would shift the rest.
            rows = zip(page['title'], page['score'], page['type'],
                       page['star'], page['show_time'], page['src_url'])
            for title, score, genre, star, show_time, src_url in rows:
                records.append(''.join([
                    '\n电影名称:', title,
                    '\n电影评分:', score,
                    '\n电影类型:', genre.strip(),
                    '\n主演:', star.strip(),
                    '\n上映时间:', show_time.strip(),
                    '\n海报链接:', src_url, '\n\n',
                ]))

        if not records:
            print('未获取到电影信息(可能触发了验证码),未写入文件。')
            return

        # newline='' keeps "\n" as written on every platform, matching the
        # bytes a binary-mode append would produce.
        with open(path, 'a', encoding='utf-8', newline='') as f:
            f.writelines(records)
        print('电影信息保存成功!')

    def star_movie(self):
        """Run the pipeline for this instance's page: fetch, parse, save."""
        html = self.movie_get(self.url)
        self.screen_html(html)
        self.write_movie()
if __name__ == '__main__':
    # Scrape three listing pages, at offsets 0, 30 and 60, using a fresh
    # scraper instance for each page.
    for offset in range(0, 90, 30):
        Mymovie(offset).star_movie()
----当然这只是为了学习----
|