Start with a single comment record. Note that the username and the user id sit in the same node, e.g.
<a href="/u/3173923450?gid=10001">兵卒在江湖</a>
Next, locate the element nodes for the comment text and the timestamp in the same way, then collect every comment on the current page. Paging is done by incrementing a paging parameter in the URL; values 0 and 1 both return the first page (in the script below, rl is fixed at 1 and page is incremented).
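As a minimal sketch of the parsing idea (the regular expression, the variable names node/base/page_urls, and the bid/uid pair are illustrative only, not necessarily what the full script uses):

import re

# example comment node: uid and screen name share one <a> tag
node = '<a href="/u/3173923450?gid=10001">兵卒在江湖</a>'
uid, name = re.findall(r'<a href="/u/(\d+)\?.*?">(.*?)</a>', node)[0]
print(uid, name)  # 3173923450 兵卒在江湖

# paging: keep rl=1 and increment the page parameter (illustrative bid/uid pair)
base = 'https://weibo.cn/comment/JAwXz9QrW?uid=1974576991'
page_urls = [base + '&rl=1&page=' + str(p) for p in range(1, 4)]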
Based on the analysis above, the crawler can be implemented against the weibo.cn mobile interface:
import xlrd
import re
import requests
import xlwt
import os
import time as t
import random
import numpy as np
import datetime
import urllib3
from multiprocessing.dummy import Pool as ThreadPool

urllib3.disable_warnings()

cookie = ''  # paste your own Weibo cookie here
headers = {
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'en-US,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Referer': 'https://www.baidu.com/',
    'Connection': 'keep-alive',
    'Cookie': cookie,
}

def require(url):
    """Fetch the page source, retrying until a 200 response is received."""
    while True:
        try:
            response = requests.get(url, headers=headers, timeout=(30, 50), verify=False)
            code_1 = response.status_code
            if code_1 == 200:
                print('正常爬取中,状态码:' + str(code_1))
                t.sleep(random.randint(1, 2))
                break
            else:
                print('请求异常,重试中,状态码为:' + str(code_1))
                t.sleep(random.randint(2, 3))
                continue
        except:
            t.sleep(random.randint(2, 3))
            continue
    html = response.text
    return html

def html_1(url):
    """Fetch a page and parse the total page count from the ' 1/N页' marker."""
    html = require(url)
    try:
        page = re.findall(' 1/(.*?)页', html, re.S)
        page = int(page[0])
    except:
        page = 0
    return html, page

def count(alls):
    """Count the total number of comment records across all pages."""
    n = 0
    for all in alls:
        for i in all:
            n = n + 1
    return n

def body(h_1):
    """Extract user id, username, comment text and date from one page of comments."""
    html_2 = re.findall('<div class="c" id="C.*?">(.*?)</div>', str(h_1), re.S)
    html_2 = str(html_2)
    # user ids come from the fuid parameter of the 举报 (report) link
    user_ids = re.findall('<a href=".*?&fuid=(.*?)&.*?">举报</a> ', html_2, re.S)
    # usernames: every <a>text</a> that is not a 举报/赞[...]/回复 link or an @mention
    names_0 = re.findall('<a href=.*?>(.*?)</a>', html_2, re.S)
    names = []
    ma = ['举报', '赞[]', '回复']
    pattern = re.compile(r'\d+')
    for i in names_0:
        i = re.sub(pattern, "", i)
        if i not in ma:
            if '@' not in i:
                names.append(i)
    # comment text: strip reply prefixes, links and emoji <img> tags
    pattern_0 = re.compile(r'回复<a href=.*?</a>:')
    pattern_0_1 = re.compile(r'<a href=.*?</a>')
    pattern_0_2 = re.compile(r'<img alt=.*?/>')
    contents = []
    contents_2 = []
    contents_0 = re.findall('<span class="ctt">(.*?)</span>', html_2, re.S)
    contents_1 = re.findall('<a href=.*?>@.*?</a>(.*?)<a href=.*?>举报</a> ', html_2, re.S)
    for i in contents_0:
        i = re.sub(pattern_0, '', i)
        i = re.sub(pattern_0_1, '', i)
        i = re.sub(pattern_0_2, '', i)
        i = i.replace(':', '')
        i = i.strip()
        contents_2.append(i)
    for i in contents_1:
        i = re.sub(pattern_0, '', i)
        i = re.sub(pattern_0_1, '', i)
        i = re.sub(pattern_0_2, '', i)
        i = i.replace('</span>', '')
        i = i.replace(' ', '')
        i = i.replace(':', '')
        i = i.strip()
        contents_2.append(i)
    for i in contents_2:
        i = re.sub(r'\s', '', i)
        if len(i) == 0:
            pass
        else:
            contents.append(i)
    # dates: 'MM月DD日' if present, otherwise fall back to today's date
    times_0 = re.findall('<span class="ct">(.*?)</span>', html_2, re.S)
    times = []
    pattern_1 = re.compile(r'\d{2}月\d{2}日')
    for i in times_0:
        try:
            t_1 = re.match(pattern_1, i).group()
        except:
            a = datetime.datetime.now().strftime('%m%d')
            t_1 = a[:2] + '月' + a[2:] + '日'
        times.append(t_1)
    # assemble one record per user id; pad missing comment text with '空'
    all = []
    for i in range(len(user_ids)):
        try:
            al = [user_ids[i], names[i], contents[i], times[i]]
        except:
            j = '空'
            contents.append(j)
            al = [user_ids[i], names[i], contents[i], times[i]]
        all.append(al)
    return all

def save_afile(alls, filename):
    """Save all comment records for one post into a single Excel file."""
    f = xlwt.Workbook()
    sheet1 = f.add_sheet(u'sheet1', cell_overwrite_ok=True)
    sheet1.write(0, 0, '用户ID')
    sheet1.write(0, 1, '用户名')
    sheet1.write(0, 2, '评论内容')
    sheet1.write(0, 3, '时间')
    i = 1
    for all in alls:
        for data in all:
            for j in range(len(data)):
                sheet1.write(i, j, data[j])
            i = i + 1
    # the output directory '今年' must exist before saving
    f.save(r'今年/' + filename + '.xls')

def extract(inpath, l):
    """Read column l from the first sheet of an Excel file, skipping the header row."""
    # note: reading .xlsx files requires xlrd < 2.0
    data = xlrd.open_workbook(inpath, encoding_override='utf-8')
    table = data.sheets()[0]
    nrows = table.nrows
    ncols = table.ncols
    numbers = []
    for i in range(1, nrows):
        alldata = table.row_values(i)
        result = alldata[l]
        numbers.append(result)
    return numbers

def run(ids):
    """Crawl every comment page of one post; ids is a [bid, uid] pair."""
    b = ids[0]
    u = str(ids[1]).replace('.0', '')
    alls = []
    pa = []
    url = 'https://weibo.cn/comment/' + str(b) + '?uid=' + str(u)
    html, page = html_1(url)
    if page == 0:
        # only a single page of comments
        try:
            data_1 = body(html)
        except:
            data_1 = pa
        alls.append(data_1)
    else:
        # crawl at most the first 50 pages
        for j in range(1, page + 1):
            if j >= 51:
                break
            else:
                url_1 = url + '&rl=1' + '&page=' + str(j)
                htmls, pages = html_1(url_1)
                alls.append(body(htmls))
                t.sleep(1)
    print('共计' + str(page) + '页,共有' + str(count(alls)) + '个数据')
    save_afile(alls, b)
    print('微博号为' + str(b) + '的评论数据文件、保存完毕')

if __name__ == '__main__':
    # bid and uid columns come from the Excel file produced by the post-crawling step
    bid = extract('..//1.微博正文爬取//正文_2.xlsx', 1)
    uid = extract('..//1.微博正文爬取//正文_2.xlsx', 2)
    ids = []
    for i, j in zip(bid, uid):
        ids.append([i, j])
    pool = ThreadPool()
    pool.map(run, ids)
The crawl speed can be tuned by changing the t.sleep values. Remember to fill in your own cookie. Example input URL: https://weibo.cn/comment/JAwXz9QrW?uid=1974576991 where JAwXz9QrW is the bid of the post and 1974576991 is the id of the user who published it. Iterate over all bid/uid pairs; multithreading speeds this up.
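For a quick single-post sanity check (using the example bid and uid from the URL above, and bypassing the thread pool), run() can also be called directly:

run(['JAwXz9QrW', '1974576991'])  # [bid, uid] of the example post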
Besides this crawler, there are other Weibo-based public-opinion analysis programs, such as a user-profile crawler; see:
https://github.com/stay-leave/weibo-public-opinion-analysis