目录
代码
部分结果
参考文献
你需要修改的是代码中的 "User-Agent" 和 "Cookie" 这两个请求头字段:请打开浏览器的开发者工具,从你自己登录微博后的请求中复制对应的值进行替换,否则无法获取到页面内容。
代码
# -*- codeing = utf-8 -*-
# @Time : 2021/11/29 13:47
# @Author : My_progress1
# @File : 微博热播50.py
# @Software : PyCharm
import urllib.request
from bs4 import BeautifulSoup
import re
import random
import time
import urllib.request,urllib.error
import xlwt
def main():
    """Entry point: scrape the Weibo hot-search page and dump it to Excel."""
    # 1. Fetch and parse the hot-search listing.
    target_url = 'https://s.weibo.com/top/summary'
    output_path = ".\\微博50.xls"
    rows = getData(target_url)
    # 2. Persist the parsed rows into an .xls workbook.
    saveData(rows, output_path)
# Fetch the raw HTML of a single URL, sending browser-like headers so
# Weibo serves the real page instead of a login redirect.
def askUrl(url):
    """Return the UTF-8 decoded HTML body of *url*.

    Parameters
    ----------
    url : str
        Address to fetch.

    Returns
    -------
    str
        Response body decoded as UTF-8.

    Raises
    ------
    urllib.error.URLError
        On network failure or an HTTP error status.
    """
    headers = {
        # NOTE(review): the Cookie below is account-specific and expires;
        # replace User-Agent/Cookie with values from your own browser.
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
        'Cookie':'login_sid_t=279de679455657cacb79ec4065a0904e; cross_origin_proto=SSL; Apache=1190887741059.4321.1637751563454; SINAGLOBAL=1190887741059.4321.1637751563454; _s_tentry=passport.weibo.com; ULV=1637751563457:1:1:1:1190887741059.4321.1637751563454:; WBtopGlobal_register_version=2021112818; webim_unReadCount=%7B%22time%22%3A1638100408403%2C%22dm_pub_total%22%3A4%2C%22chat_group_client%22%3A0%2C%22chat_group_notice%22%3A0%2C%22allcountNum%22%3A8%2C%22msgbox%22%3A0%7D; UOR=,,www.baidu.com; SUB=_2A25MoNiWDeRhGeNP6VoX8SrFzjiIHXVv1E1erDV8PUNbmtAKLW3akW9NToltBT0K5O16dRj_Sap_RIllhgDAGM7Y; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9W5QUz5lMFNL6ksJ_NuE.B3e5JpX5KzhUgL.Fo-peonceKB4SKB2dJLoIpB4gFH8SCHWSFHFeCH81FHWxbHWx5tt; ALF=1669717061; SSOLoginState=1638181062'
    }
    request = urllib.request.Request(url, headers=headers)
    # BUG FIX: the original never closed the response (resource leak) and
    # carried a dead `html = ""` assignment. A with-block closes the
    # connection deterministically.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
    return html
# Pre-compiled regexes applied to the stringified HTML of each <tr> row.
findnum=re.compile(r'<td class="td-01 ranktop">(.*?)</td>', re.S)  # rank number cell
findhot=re.compile(r'<span>(.*?)</span>', re.S)  # heat value inside the <span>
findcomment=re.compile(r'target="_blank">(.*?)</a>',re.S)  # topic title (link text)
# Red label icon (presumably the 热/沸 tag — confirm against live markup).
findstyle=re.compile(r'<i class="icon-txt" style="background-color:#ff3852">(.*)</i>',re.S)
# Parse the hot-search page into row data (the original comment here
# mislabelled this as the "save" step).
def getData(baseuel):
    """Fetch and parse the hot-search page at *baseuel*.

    Parameters
    ----------
    baseuel : str
        URL of the Weibo hot-search summary page.

    Returns
    -------
    list[list]
        One ``[rank, title, heat, tag]`` row per hot-search entry.
        Malformed rows are skipped rather than saved half-filled.
    """
    datalist = []
    html = askUrl(baseuel)
    soup = BeautifulSoup(html, "html.parser")
    for item in soup.find_all('tr', class_=""):
        data = []
        item = str(item)
        try:
            num = re.findall(findnum, item)
            if len(num) == 0:
                # Header/advert rows carry no rank cell; skip them.
                continue
            num = num[0]
            print("正在爬取第", num, "条")
            data.append(num)
            comment = re.findall(findcomment, item)[0]
            data.append(comment)
            hot = re.findall(findhot, item)[0]
            hot = hot.replace(" ", "")
            data.append(hot)
            # Not every entry carries the red tag; store a blank
            # placeholder so every row has exactly four columns.
            style = re.findall(findstyle, item)
            data.append(style[0] if style else " ")
        except (IndexError, AttributeError) as e:
            # BUG FIX: the old handler probed URLError-style .code/.reason
            # attributes that IndexError/AttributeError never have, silently
            # swallowing every parse failure — and then still appended the
            # partial row, which later crashed saveData. Log and skip.
            print("跳过解析失败的行:", e)
            continue
        # Be polite to the server: randomised ~1-1.5 s pause per row.
        a = random.random()
        time.sleep(1 + 0.5 * a)
        datalist.append(data)
    return datalist
def saveData(datalist, savepath):
    """Write the scraped rows into an ``.xls`` workbook.

    Parameters
    ----------
    datalist : list[list]
        Rows of ``[rank, title, heat, tag]``.
    savepath : str
        Destination path of the Excel file.
    """
    book = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = book.add_sheet('weibo', cell_overwrite_ok=True)
    col = ('编号', "题目", "热度", "性质")
    for i, header in enumerate(col):
        sheet.write(0, i, header)
    # BUG FIX: the original hard-coded range(0, 50) and raised IndexError
    # whenever fewer than 50 rows were scraped; iterate what we have.
    for row_idx, data in enumerate(datalist, start=1):
        for col_idx, value in enumerate(data[:len(col)]):
            sheet.write(row_idx, col_idx, value)
    book.save(savepath)
    print("save..")
# Script entry point: everything above is only definitions; execution
# starts here when the file is run directly (not when imported).
if __name__ == "__main__":
    main()
    print("爬取完毕")
部分结果
参考文献
Python爬虫编程基础5天速成(2021全新合集)Python入门+数据分析_哔哩哔哩_bilibili
?
|