爬虫学习
爬取目标:豆瓣电影 Top 250 排行榜(https://movie.douban.com/top250)。工具:VS Code。1、首先安装 requests 和 lxml,在终端输入:
pip install requests
pip install lxml
2、查看豆瓣页面源代码,检查电影名是否可以直接从源代码中获取:按 Ctrl+F 打开查找框,输入“霸王别姬”,如果能搜到,就可以确定所有电影名都可直接从源码中获取。
3、编写代码
# Fetch the first page of the Douban Top 250 list and print each movie's
# title, score and one-line quote.
url = 'https://movie.douban.com/top250'
# A browser User-Agent is sent with the request
# (presumably because the site rejects the default client UA — verify).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47'
}
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
resp.raise_for_status()
tree = etree.HTML(resp.text)
# Each <li> under the ranking <ol> is one movie entry.
the_first = tree.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
for i in the_first:
    title = i.xpath('./div/div[1]/a/img/@alt')[0]
    score = i.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0].strip()
    # An entry without a quote yields an empty result list; fall back to an
    # empty string instead of raising IndexError.
    comment_nodes = i.xpath('./div/div[2]/div[2]/p[2]/span/text()')
    comment = comment_nodes[0] if comment_nodes else ''
    print(title)
    print(score)
    print(comment)
这里的 XPath 路径可以在浏览器开发者工具中直接复制得到,然后根据需要自行调整。XPath 获取内容的方式:要获取标签中的文本,在路径末尾加上 /text();要获取标签的属性,在路径末尾加上 /@属性名(例如 /@alt)。
4、获取到所有数据后,将其添加到数据库中(需要先在终端执行 pip install pymysql 安装 pymysql)
def insert(value):
    """Insert one movie row into the ``moviemessage`` table.

    Args:
        value: Sequence of three items ``(moviename, score, comment)`` bound
            to the three placeholders of the INSERT statement.

    A new connection is opened on every call and is always closed again.
    On failure the transaction is rolled back and a message is printed; the
    exception is not re-raised.
    """
    # NOTE: connection credentials are hard-coded for this tutorial.
    db = pymysql.connect(host='localhost', user='root', password='123456', database='python')
    sql = "INSERT INTO moviemessage(moviename,score,comment) VALUES (%s, %s, %s)"
    try:
        # The cursor is closed automatically when the with-block exits.
        with db.cursor() as cursor:
            # Parameterized query: the driver escapes the values.
            cursor.execute(sql, value)
        db.commit()
        print('插入数据成功')
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and show what actually went wrong.
        db.rollback()
        print("插入数据失败", exc)
    finally:
        # Release the connection whether or not the insert succeeded.
        db.close()
-- Table that stores the scraped movies, one row per movie.
-- Columns: auto-increment primary key `id`, movie name, score, comment.
-- NOTE(review): AUTO_INCREMENT=26 looks like a leftover from exporting a
-- table that already held rows — confirm; it is not needed for a fresh table.
-- NOTE(review): the utf8mb4_0900_ai_ci collation presumably requires
-- MySQL 8.0+ — verify against the target server.
CREATE TABLE `moviemessage` (
`id` int NOT NULL AUTO_INCREMENT COMMENT 'id',
`moviename` varchar(255) NOT NULL COMMENT '电影名',
`score` double NOT NULL COMMENT '评分',
`comment` varchar(255) NOT NULL COMMENT '评论',
PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=26 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci
所有代码:
import requests
from lxml import etree
import pymysql
def insert(value):
    """Insert one movie row into the ``moviemessage`` table.

    Args:
        value: Sequence of three items ``(moviename, score, comment)`` bound
            to the three placeholders of the INSERT statement.

    A new connection is opened on every call and is always closed again.
    On failure the transaction is rolled back and a message is printed; the
    exception is not re-raised.
    """
    # NOTE: connection credentials are hard-coded for this tutorial.
    db = pymysql.connect(host='localhost', user='root', password='123456', database='python')
    sql = "INSERT INTO moviemessage(moviename,score,comment) VALUES (%s, %s, %s)"
    try:
        # The cursor is closed automatically when the with-block exits.
        with db.cursor() as cursor:
            # Parameterized query: the driver escapes the values.
            cursor.execute(sql, value)
        db.commit()
        print('插入数据成功')
    except Exception as exc:
        # Catch Exception (not a bare except) so KeyboardInterrupt/SystemExit
        # still propagate, and show what actually went wrong.
        db.rollback()
        print("插入数据失败", exc)
    finally:
        # Release the connection whether or not the insert succeeded.
        db.close()
# Fetch the first page of the Douban Top 250 list, print each movie's title,
# score and one-line quote, and store every entry through insert().
url = 'https://movie.douban.com/top250'
# A browser User-Agent is sent with the request
# (presumably because the site rejects the default client UA — verify).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36 Edg/94.0.992.47'
}
# A timeout keeps the script from hanging forever on a stalled connection.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on an HTTP error instead of parsing an error page.
resp.raise_for_status()
tree = etree.HTML(resp.text)
# Each <li> under the ranking <ol> is one movie entry.
the_first = tree.xpath('/html/body/div[3]/div[1]/div/div[1]/ol/li')
for i in the_first:
    title = i.xpath('./div/div[1]/a/img/@alt')[0]
    score = i.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0].strip()
    # An entry without a quote yields an empty result list; fall back to an
    # empty string instead of raising IndexError (an empty string still fits
    # a NOT NULL text column).
    comment_nodes = i.xpath('./div/div[2]/div[2]/p[2]/span/text()')
    comment = comment_nodes[0] if comment_nodes else ''
    print(title)
    print(score)
    print(comment)
    # Order matches the INSERT placeholders: moviename, score, comment.
    data = (title, score, comment)
    insert(data)
|