Scraping the Douban Top 250 movies (finished!!)
import re
import sqlite3
import urllib.request, urllib.error
import xlwt
from bs4 import BeautifulSoup


def getData(baseurl):
    datalist = []
    for i in range(0, 10):                      # 10 pages, 25 movies per page
        url = baseurl + str(i * 25)
        html = askurl(url)
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []
            item = str(item)
            # Detail-page link
            link = re.findall(re.compile(r'<a href="(.*?)">'), item)[0]
            data.append(link)
            # Poster image URL
            findimg = re.compile(r'<img.*src="(.*?)"', re.S)
            img = re.findall(findimg, item)[0]
            data.append(img)
            # Chinese title plus optional foreign title
            findname = re.compile(r'<span class="title">(.*?)</span>', re.S)
            name = re.findall(findname, item)
            if len(name) == 2:
                data.append(name[0])
                data.append(name[1].replace('/', ""))
            else:
                data.append(name[0])
                data.append(" ")
            # Rating
            find_rate = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>', re.S)
            rate = re.findall(find_rate, item)[0]
            data.append(rate)
            # Number of ratings
            find_num = re.compile(r'<span>(.*?)</span>', re.S)
            num = re.findall(find_num, item)[0]
            data.append(num)
            # One-line summary; some movies have none, so pad with a blank
            find_inq = re.compile(r'<span class="inq">(.*?)</span>', re.S)
            inq = re.findall(find_inq, item)
            if len(inq) != 0:
                data.append(inq[0])
            else:
                data.append(" ")
            # Director / cast / year block
            find_bd = re.compile(r'<p class="">(.*?)</p>', re.S)
            bd = re.findall(find_bd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)
            bd = re.sub('/', ' ', bd)
            data.append(bd.strip())
            datalist.append(data)
    return datalist


def askurl(url):
    # Pretend to be a regular browser; Douban rejects urllib's default client.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html


def saveData(datalist, savepath):
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    worksheet = workbook.add_sheet('豆瓣电影Top250', cell_overwrite_ok=True)
    col = ("电影详情链接", "图片链接", "影片中文名", "影片外国名", "评分", "评价数", "概况", "相关信息")
    for i in range(0, 8):
        worksheet.write(0, i, col[i])
    for i, data in enumerate(datalist):
        print(f"第{i+1}条")                     # progress: record i+1
        for j in range(0, 8):
            worksheet.write(i + 1, j, data[j])
    workbook.save(savepath)


baseurl = "https://movie.douban.com/top250?start="
datalist = getData(baseurl)
savepath = "豆瓣电影Top250.xls"
saveData(datalist, savepath)
print("爬取完毕!")                              # done crawling
Sending a GET request
import urllib.request
response=urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))
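The response object is an http.client.HTTPResponse, so the status code and individual headers can be read directly; a quick sketch:

import urllib.request

response = urllib.request.urlopen("http://www.baidu.com")
print(response.status)                      # HTTP status code, e.g. 200
print(response.getheader("Content-Type"))   # one header by name
print(response.getheaders())                # all headers as (name, value) pairs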
Sending a POST request
import urllib.parse
data1=bytes(urllib.parse.urlencode({"hello":"world"}),encoding="utf-8")
response=urllib.request.urlopen("http://httpbin.org/post",data=data1)
print(response.read().decode("utf-8"))
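httpbin.org echoes the request back as JSON, so the posted form can be checked programmatically; a small sketch using the standard json module (an addition here, not part of the original notes):

import json
import urllib.parse, urllib.request

data1 = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
response = urllib.request.urlopen("http://httpbin.org/post", data=data1)
echo = json.loads(response.read().decode("utf-8"))
print(echo["form"])   # the form fields httpbin received: {'hello': 'world'}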
Timeout handling
import urllib.error

try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("Time out")

# Previously commented out: inspecting the response headers
# response = urllib.request.urlopen("https://www.bilibili.com")
# print(response.getheaders())
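A bare except URLError also swallows failures that are not timeouts. The wrapped cause is exposed via e.reason, so genuine timeouts can be singled out (on older Python 3 it is a socket.timeout; newer versions alias it to the built-in TimeoutError, so the check below works either way):

import socket
import urllib.request, urllib.error

try:
    response = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
    print(response.read().decode("utf-8"))
except urllib.error.URLError as e:
    # Only treat genuine timeouts as "Time out"; re-raise everything else.
    if isinstance(e.reason, socket.timeout):
        print("Time out")
    else:
        raise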
Disguising the request as a browser
url="https://httpbin.org/post"
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
}
data=bytes(urllib.parse.urlencode({'name':'eric'}),encoding="utf-8")
req=urllib.request.Request(url=url,data=data,headers=headers,method="POST")
response=urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
Crawling the Douban site
url="https://movie.douban.com/top250"
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36"
}
req=urllib.request.Request(url=url,headers=headers)
response=urllib.request.urlopen(req)
print(response.read().decode("utf-8"))
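The header is what makes this work: Douban has been known to refuse urllib's default client with HTTP 418 instead of serving the page. A sketch to observe that refusal (the exact status code is Douban's behavior at the time these notes were written, not guaranteed):

import urllib.request, urllib.error

try:
    urllib.request.urlopen("https://movie.douban.com/top250")
except urllib.error.HTTPError as e:
    print(e.code)   # Douban has answered 418 to clients without a browser User-Agent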
Basic use of regular expression functions
import re
# pat = re.compile("AAA")
# m = pat.search("AAAGGFREGAAA").group()   # search finds the first match
# print(m)
# print(re.findall('a', 'reaafra'))
print(re.findall('[A-Z]+', 'weqGREG'))   # ['GREG']
print(re.sub("a", 'A', 'aregreGEA'))     # 'AregreGEA'
print(r"/aeg/'")                         # r"" raw string: printed as-is, no escape processing