'''
Scrape movie name, cast, release date, and rating from
https://maoyan.com/board/4?offset=0.
A User-Agent header is required, and requests must not be sent too quickly.
'''
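# spider_tool is assumed to be a local helper module whose Tool class exposes
# generate_user_agent() for producing a random User-Agent string.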
import spider_tool
import requests
from lxml import etree
import pymysql
import time
def get_html(url):
    '''
    Fetch the page and return the response.
    :param url: page URL to request
    :return: requests.Response object
    '''
    t = spider_tool.Tool()
    ua = t.generate_user_agent()
    headers = {
        "User-Agent": ua
    }
    print(headers)
    res = requests.get(url, headers=headers)
    # Use the detected encoding so the Chinese text is decoded correctly
    res.encoding = res.apparent_encoding
    return res
def get_data(res):
    '''
    Parse the concrete fields from the response.
    :param res: requests.Response returned by get_html
    :return: None
    '''
    print(res.status_code)
    print("parsing")
    html = etree.HTML(res.text)
    # XPath selectors match the layout of the Maoyan top-100 board page
    name_list = html.xpath("//p[@class='name']/a/@title")
    star_list = [i.strip() for i in html.xpath("//p[@class='star']/text()")]
    releasetime_list = html.xpath("//p[@class='releasetime']/text()")
    score_list = [i.xpath('string(.)') for i in html.xpath("//p[@class='score']")]
    for name, star, releasetime, score in zip(name_list, star_list, releasetime_list, score_list):
        print(name)
        print(star)
        print(releasetime)
        print(score)
        save_to_mysql(name, star, releasetime, score)
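# A minimal sketch of the movies_info table that save_to_mysql writes to; the
# column types below are assumptions, since the original script only issues the
# INSERT statement:
# CREATE TABLE IF NOT EXISTS movies_info (
#     name  VARCHAR(255),
#     star  VARCHAR(255),
#     time  VARCHAR(64),
#     score VARCHAR(16)
# );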
def save_to_mysql(*args):
    '''
    Save one record to the database.
    :param args: (name, star, releasetime, score) tuple
    :return: None
    '''
    print(args)
    sql = 'insert into movies_info(name,star,time,score) values(%s,%s,%s,%s)'
    try:
        cursor.execute(sql, args)
    except Exception as e:
        print(e)
        conn.rollback()
    else:
        print("saved to database")
        conn.commit()
if __name__ == '__main__':
    conn = pymysql.Connect(
        host='localhost',
        user='root',
        password='123456',
        database='testdata',
        charset='utf8',
        port=3306
    )
    cursor = conn.cursor()
    # The top-100 board spans 10 pages, 10 movies per page
    for i in range(10):
        url = 'https://maoyan.com/board/4?offset=%s' % (i * 10)
        print(url)
        # Pause between requests to avoid scraping too fast
        time.sleep(2)
        get_data(get_html(url))
    cursor.close()
    conn.close()