~ Sharing a post packed with practical content
I. Scraping the Douban Top 250
(Learning video: "Python Crawler Programming Basics: 5-Day Crash Course (2021 Collection), Python Introduction + Data Analysis" on Bilibili)
1. Fetching the page
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
    }
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
Key points: (1) The url argument is the address of the page to fetch.
            (2) head disguises the request as coming from a regular browser so that Douban will accept it.
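As a quick sanity check, askURL can be called directly on the first page of the list. This is only a minimal sketch; it assumes the imports from the full listing further down, and the printed length is merely there to confirm that something came back.
# Minimal usage sketch for askURL (illustration only).
html = askURL("https://movie.douban.com/top250?start=0")
print(len(html))   # a non-zero length suggests the page was fetched successfully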
2. Extracting the data: BeautifulSoup locates each movie block, and regular expressions then pull the individual fields out of it
def getData(baseurl):
    datalist = []
    for i in range(0, 10):                                   # 10 pages, 25 films per page
        url = baseurl + str(i * 25)
        html = askURL(url)
        # 2. Parse each page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):     # every matching block is one film
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]             # link to the film's detail page
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)                              # poster image link
            titles = re.findall(findTitle, item)             # a film may have only a Chinese title, no foreign one
            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)                          # Chinese title
                otitle = titles[1].replace("/", "")          # strip the "/" separating the two titles
                data.append(otitle)                          # foreign title
            else:
                data.append(titles[0])
                data.append(' ')
            rating = re.findall(findRating, item)[0]
            data.append(rating)                              # rating
            judge = re.findall(findJudege, item)[0]
            data.append(judge)                               # number of ratings
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")                # drop the trailing full stop
                data.append(inq)                             # one-line summary
            else:
                data.append(" ")
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())                          # director / cast / year / genre
            datalist.append(data)
    # print(datalist)
    return datalist
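The compiled patterns getData relies on (findlink, findImgSrc, findTitle, and so on) are defined at module level in the full listing at the end of this part. To see how two of them behave, here is a small demo on a simplified, hand-written fragment of one movie block (the fragment is made up for illustration, not copied from Douban):
import re

findlink = re.compile(r'<a href="(.*?)">')                   # detail-page link
findTitle = re.compile(r'<span class="title">(.*)</span>')   # film title

sample = '<a href="https://movie.douban.com/subject/1292052/"><span class="title">肖申克的救赎</span></a>'
print(re.findall(findlink, sample))    # ['https://movie.douban.com/subject/1292052/']
print(re.findall(findTitle, sample))   # ['肖申克的救赎']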
3. Saving the data to a database
def savedb(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:        # score and rating count are numeric, so no quotes
                continue
            data[index] = '"' + data[index] + '"'
        sql = '''
            insert into movie250
            (infor_link,pic_link,cname,ename,socre,rated,instroduction,infor)
            values(%s)''' % ",".join(data)
        print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()
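Because savedb builds the INSERT statement by quoting and joining the values itself, a title that happens to contain a double quote would break the SQL. A safer variant (my own sketch, not part of the tutorial) lets sqlite3 do the quoting through ? placeholders:
def savedb_params(datalist, dbpath):
    # Same effect as savedb above, but using sqlite3 parameter substitution.
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    sql = '''insert into movie250
             (infor_link,pic_link,cname,ename,socre,rated,instroduction,infor)
             values (?,?,?,?,?,?,?,?)'''
    cur.executemany(sql, datalist)   # each row in datalist holds exactly 8 fields
    conn.commit()
    cur.close()
    conn.close()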
## Initialise the database
def init_db(dbpath):
    sql = '''
        create table movie250
        (
            id integer primary key autoincrement,
            infor_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            socre numeric,
            rated numeric,
            instroduction text,
            infor text
        )
    '''
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()
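One thing to watch: init_db runs on every execution, and "create table" raises an OperationalError if movie.db already contains the table from a previous run. A small tweak (my suggestion, not part of the original tutorial) avoids that:
# Variant of the schema statement that tolerates an existing table.
sql = '''
    create table if not exists movie250
    (
        id integer primary key autoincrement,
        infor_link text, pic_link text,
        cname varchar, ename varchar,
        socre numeric, rated numeric,
        instroduction text, infor text
    )
'''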
4. Calling everything from main
def main():
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Fetch and parse the pages
    datalist = getData(baseurl)
    # savepath = "豆瓣电影top250.xls"
    dbpath = "movie.db"
    # 3. Save the data
    # saveData(datalist, savepath)
    # askURL("https://movie.douban.com/")
    savedb(datalist, dbpath)
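getData builds each page address by appending i*25 to baseurl, so the ten requests cover the whole list. A quick way to see the URLs it will visit (illustration only):
# The ten page URLs requested by getData (25 films per page).
for i in range(0, 10):
    print("https://movie.douban.com/top250?start=" + str(i * 25))
# https://movie.douban.com/top250?start=0
# https://movie.douban.com/top250?start=25
# ...
# https://movie.douban.com/top250?start=225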
5. Result screenshot
Appendix (complete code for the Douban Top 250 scraper):
from bs4 import BeautifulSoup
import re
import urllib.request
import urllib.error
import xlwt
import sqlite3
def main():
    baseurl = "https://movie.douban.com/top250?start="
    # 1. Fetch and parse the pages
    datalist = getData(baseurl)
    # savepath = "豆瓣电影top250.xls"
    dbpath = "movie.db"
    # 3. Save the data
    # saveData(datalist, savepath)
    # askURL("https://movie.douban.com/")
    savedb(datalist, dbpath)
findlink = re.compile(r'<a href="(.*?)">')                                              # detail-page link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)                                     # poster image; re.S lets "." match newlines
findTitle = re.compile(r'<span class="title">(.*)</span>')                              # film title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')   # rating
findJudege = re.compile(r'<span>(\d*)人评价</span>')                                     # number of ratings
findInq = re.compile(r'<span class="inq">(.*)</span>')                                  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)                                     # director / cast / year / genre
def askURL(url):
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"
    }
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def getData(baseurl):
    datalist = []
    for i in range(0, 10):                                   # 10 pages, 25 films per page
        url = baseurl + str(i * 25)
        html = askURL(url)
        # 2. Parse each page
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):     # every matching block is one film
            data = []
            item = str(item)
            link = re.findall(findlink, item)[0]             # link to the film's detail page
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)                              # poster image link
            titles = re.findall(findTitle, item)             # a film may have only a Chinese title, no foreign one
            if len(titles) == 2:
                ctitle = titles[0]
                data.append(ctitle)                          # Chinese title
                otitle = titles[1].replace("/", "")          # strip the "/" separating the two titles
                data.append(otitle)                          # foreign title
            else:
                data.append(titles[0])
                data.append(' ')
            rating = re.findall(findRating, item)[0]
            data.append(rating)                              # rating
            judge = re.findall(findJudege, item)[0]
            data.append(judge)                               # number of ratings
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")                # drop the trailing full stop
                data.append(inq)                             # one-line summary
            else:
                data.append(" ")
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', " ", bd)
            data.append(bd.strip())                          # director / cast / year / genre
            datalist.append(data)
    # print(datalist)
    return datalist
def saveData(datalist, savepath):
    book = xlwt.Workbook(encoding="utf-8")       # create the workbook
    sheet = book.add_sheet("豆瓣电影top250")      # create the worksheet
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价人数", "概况", "相关信息")
    for i in range(0, 8):
        sheet.write(0, i, col[i])                # header row
    for i in range(0, 250):
        print("第%d条" % (i + 1))
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    book.save(savepath)
def savedb(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:        # score and rating count are numeric, so no quotes
                continue
            data[index] = '"' + data[index] + '"'
        sql = '''
            insert into movie250
            (infor_link,pic_link,cname,ename,socre,rated,instroduction,infor)
            values(%s)''' % ",".join(data)
        print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()
def init_db(dbpath):
    sql = '''
        create table movie250
        (
            id integer primary key autoincrement,
            infor_link text,
            pic_link text,
            cname varchar,
            ename varchar,
            socre numeric,
            rated numeric,
            instroduction text,
            infor text
        )
    '''
    conn = sqlite3.connect(dbpath)
    cursor = conn.cursor()
    cursor.execute(sql)
    conn.commit()
    conn.close()
if __name__ == "__main__":
    main()
    print("爬虫成功")
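The listing also defines saveData, an xlwt-based Excel export that main() keeps commented out. If an .xls file is wanted instead of (or alongside) the SQLite database, main() can be adjusted roughly like this (a sketch of the switch only):
def main():
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)
    saveData(datalist, "豆瓣电影top250.xls")   # Excel export via xlwt
    savedb(datalist, "movie.db")               # SQLite export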
II. Eastmoney (东方财富网) data
(1) Scraping static titles
Scrape the headlines listed under Ping An Bank (000001).

1. Fetching the page: same approach as for the Douban Top 250
## 1. Fetch the page
def askURL(url):
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"}
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
2. Extracting the data: parse with BeautifulSoup plus a regular expression
def getData(baseurl):
    ls = []
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('td'):
        item = str(item)
        data = re.findall(finddata, item)
        if len(data) != 0:
            ls.append(data[0][1])
    # print(ls)
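finddata (defined at module level in the full listing) captures both the href and the anchor text of every link inside a <td> cell; only the text, data[0][1], is kept. On a simplified, hand-written cell (the URL and headline are made up for illustration) it behaves like this:
import re

finddata = re.compile(r'<a href="(.*?)">(.*?)</a>')

sample = '<td><a href="http://example.com/news/1">平安银行发布三季度报告</a></td>'
print(re.findall(finddata, sample))          # [('http://example.com/news/1', '平安银行发布三季度报告')]
print(re.findall(finddata, sample)[0][1])    # 平安银行发布三季度报告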
3. Calling from main
def main():
    baseurl = "http://data.eastmoney.com/zjlx/000001.html"
    askURL(baseurl)
    getData(baseurl)
4. Result screenshot

Appendix (complete code):
import re
import urllib.request
import urllib.error
from bs4 import BeautifulSoup
def main():
    baseurl = "http://data.eastmoney.com/zjlx/000001.html"
    askURL(baseurl)
    getData(baseurl)
finddata = re.compile(r'<a href="(.*?)">(.*?)</a>')
def askURL(url):
    head = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31"}
    req = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
def getData(baseurl):
    ls = []
    html = askURL(baseurl)
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('td'):
        item = str(item)
        data = re.findall(finddata, item)
        if len(data) != 0:
            ls.append(data[0][1])
    print(ls)
if __name__ == "__main__":
    main()
(2) Scraping dynamic data and saving it to a database
(Reference: "爬虫实战 | 爬取东方财富网股票数据" (Crawler in practice: scraping Eastmoney stock data), cainiao_python's blog on CSDN)
Scrape the dynamically loaded data shown in the screenshot below.

1. The complete code is shown below
import requests
import re
import pymysql
def main():
    ## 1. Connect to the database
    db = pymysql.connect(host='localhost', port=3306, user='root', password='123abc', database='test1', charset='utf8')
    cursor = db.cursor()
    ## 2. Create the table (id auto-increments, so the INSERT below supplies only the 15 data columns)
    sql = '''create table df(
        id int not null auto_increment primary key,
        daimas text not null,
        names text not null,
        zuixinjias float not null,
        zhangdiefus float not null,
        zhangdiees float not null,
        chengjiaoliangs float not null,
        chengjiaoes float not null,
        zhenfus float not null,
        zuigaos float not null,
        zuidis float not null,
        jinkais float not null,
        zuoshous float not null,
        liangbis float not null,
        huanshoulvs float not null,
        shiyinglvs float not null
    )
    '''
    cursor.execute(sql)
    db.commit()    ## commit the DDL
    db.close()     ## close the connection (re-opened later via ping(reconnect=True))
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36 Edg/94.0.992.31'}
    for page in range(1, 100):
        params = {
            'cb': 'jQuery1124014630218253883864_1632748641836',
            'pn': str(page),
            'pz': '20',
            'po': '1',
            'np': '1',
            'ut': 'bd1d9ddb04089700cf9c27f6f7426281',
            'fltt': '2',
            'invt': '2',
            'fid': 'f3',
            'fs': 'm:1 t:2,m:1 t:23',
            'fields': 'f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152'
        }
        response = requests.get('http://16.push2.eastmoney.com/api/qt/clist/get', headers=headers, params=params)
        daimas = re.findall('"f12":(.*?),', response.text)            # stock code
        names = re.findall('"f14":"(.*?)"', response.text)            # stock name
        zuixinjias = re.findall('"f2":(.*?),', response.text)         # latest price
        zhangdiefus = re.findall('"f3":(.*?),', response.text)        # change (%)
        zhangdiees = re.findall('"f4":(.*?),', response.text)         # change (amount)
        chengjiaoliangs = re.findall('"f5":(.*?),', response.text)    # volume
        chengjiaoes = re.findall('"f6":(.*?),', response.text)        # turnover
        zhenfus = re.findall('"f7":(.*?),', response.text)            # amplitude
        zuigaos = re.findall('"f15":(.*?),', response.text)           # high
        zuidis = re.findall('"f16":(.*?),', response.text)            # low
        jinkais = re.findall('"f17":(.*?),', response.text)           # open
        zuoshous = re.findall('"f18":(.*?),', response.text)          # previous close
        liangbis = re.findall('"f10":(.*?),', response.text)          # volume ratio
        huanshoulvs = re.findall('"f8":(.*?),', response.text)        # turnover rate
        shiyinglvs = re.findall('"f9":(.*?),', response.text)         # P/E ratio
        ls = []
        ls1 = []
        for i in range(len(daimas)):      # one row per stock returned on this page
            ls1.append(daimas[i])
            ls1.append(names[i])
            ls1.append(zuixinjias[i])
            ls1.append(zhangdiefus[i])
            ls1.append(zhangdiees[i])
            ls1.append(chengjiaoliangs[i])
            ls1.append(chengjiaoes[i])
            ls1.append(zhenfus[i])
            ls1.append(zuigaos[i])
            ls1.append(zuidis[i])
            ls1.append(jinkais[i])
            ls1.append(zuoshous[i])
            ls1.append(liangbis[i])
            ls1.append(huanshoulvs[i])
            ls1.append(shiyinglvs[i])
            if len(ls1) == 15:
                ls.append(ls1)
                ls1 = []
        ## Write the rows into the table created above
        query = ("insert into df(daimas, names, zuixinjias, zhangdiefus, zhangdiees, chengjiaoliangs, chengjiaoes, "
                 "zhenfus, zuigaos, zuidis, jinkais, zuoshous, liangbis, huanshoulvs, shiyinglvs) "
                 "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
        for j in ls:
            cursor = db.cursor()
            db.ping(reconnect=True)     # the connection was closed earlier; ping(reconnect=True) re-opens it
            cursor.execute(query, j)
            db.commit()
        db.close()
if __name__ == '__main__':
    main()
    print("爬取成功!")
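A note on the response format: because of the cb parameter, the API returns JSONP (the JSON wrapped in a jQuery...() callback), which is why the code above picks each field out with its own regular expression. An alternative sketch of mine (not the tutorial's code, and it assumes the endpoint keeps its current data/diff layout) drops cb so the endpoint returns plain JSON and reads the fields directly:
import requests

# Assumed field meanings, matching the regexes above: f12 = code, f14 = name,
# f2 = latest price, f3 = change percent.
params = {
    'pn': '1', 'pz': '20', 'po': '1', 'np': '1',
    'ut': 'bd1d9ddb04089700cf9c27f6f7426281',
    'fltt': '2', 'invt': '2', 'fid': 'f3',
    'fs': 'm:1 t:2,m:1 t:23',
    'fields': 'f2,f3,f12,f14',
}
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get('http://16.push2.eastmoney.com/api/qt/clist/get',
                        headers=headers, params=params)
# Assumes data.diff is a list of per-stock dicts, as returned at the time of writing.
for row in response.json()['data']['diff']:
    print(row['f12'], row['f14'], row['f2'], row['f3'])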
2. Result screenshot


Do not repost without my permission!!!