Implemented: crawl a video's comment section and generate a word cloud, saved to the desktop. Not implemented: changing the word cloud's shape (e.g. to an apple outline); the error it raises is still unresolved. Video demo:
First, create B站评论爬虫改.py. Given a Bilibili video URL, it crawls the comment section and writes every comment to a .txt file on the desktop.
import random
import requests
import time
import re
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.106 Safari/537.36',
}
# Extract the video's aid from the page source; it doubles as the output file name
def get_oid(url):
    resp = requests.get(url, headers=headers).text
    oid = re.findall('"aid":(.*?),', resp)[0]
    return oid
def get_second_reply(rpid, page):
    page_num = 1
    while True:
        params = {
            'jsonp': 'jsonp',
            'pn': page_num,  # second-level replies use plain page-number pagination
            'type': '1',
            'oid': aid,
            'ps': '10',      # page size
            'root': rpid,    # rpid of the parent first-level comment
            '_': '1640190856379',
        }
        second_url = 'https://api.bilibili.com/x/v2/reply/reply?'
        resp = requests.get(url=second_url, headers=headers, params=params).json()
        count = resp['data']['page']['count']
        replies = resp['data']['replies']
        if not replies:  # the API returns null past the last page
            break
        for reply in replies:
            reply_create_time = reply['ctime']      # collected but not written out
            reply_uname = reply['member']['uname']  # collected but not written out
            # Strip newlines and swap ASCII commas for fullwidth ones so each comment stays on one line
            reply_content = reply['content']['message'].replace('\n', '').replace(',', ',')
            reply_rpid = reply['rpid']
            line = '{}\n'.format(reply_content)
            fd.write(line)
        print('\rPage: {} -*- second-level page: {}'.format(page, page_num), end='')
        time.sleep(random.uniform(0.5, 0.7))  # polite delay between requests
        page_num += 1
        if (page_num - 1) * 10 >= count:  # every page fetched (10 replies per page)
            break
def get_first_reply():
    page_num = 0
    while True:
        params = {
            'jsonp': 'jsonp',
            'next': page_num,  # first-level replies use cursor-based pagination
            'type': '1',
            'oid': aid,
            'mode': '3',       # sort mode
            'plat': '1',
            '_': '1640189054670',
        }
        first_url = 'https://api.bilibili.com/x/v2/reply/main?'
        resp = requests.get(url=first_url, headers=headers, params=params).json()
        print('\rPage: {} -*- second-level page: 0\n'.format(page_num), end='')
        page_num = resp['data']['cursor']['next']  # the response supplies the next cursor
        replies = resp['data']['replies']
        if not replies:
            break
        for reply in replies:
            reply_create_time = reply['ctime']      # collected but not written out
            reply_uname = reply['member']['uname']  # collected but not written out
            reply_content = reply['content']['message'].replace('\n', '').replace(',', ',')
            reply_rpid = reply['rpid']
            reply_count = reply['rcount']  # number of second-level replies under this one
            line = '{}\n'.format(reply_content)
            fd.write(line)
            if reply_count != 0:
                get_second_reply(rpid=reply_rpid, page=page_num)
        time.sleep(random.uniform(0.2, 0.7))
aid = get_oid(url=input('url: '))
fd = open('C://Users//Wfy//Desktop//{}.txt'.format(aid), 'a', encoding='utf-8-sig')
def geturl():
    get_first_reply()
    fd.close()  # flush the buffer so the word-cloud script sees the complete file
    return aid
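For anyone adapting this, note that the two endpoints paginate differently: reply/main hands back a cursor in its response (the next field the loop feeds forward), while reply/reply takes a plain page number pn. A minimal standalone sketch of one cursor request, assuming the endpoint still accepts these parameters (the oid below is a placeholder, not a real video):
import requests

resp = requests.get(
    'https://api.bilibili.com/x/v2/reply/main',
    headers={'user-agent': 'Mozilla/5.0'},
    params={'next': 0, 'type': 1, 'oid': 12345, 'mode': 3, 'plat': 1},  # placeholder oid
).json()
print(resp['data']['cursor'])              # holds the 'next' value the loop feeds forward
print(len(resp['data']['replies'] or []))  # replies comes back null past the last page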
Next, create 词云增强.py. It reads the .txt file produced by the first script and renders the word cloud image.
import jieba      # jieba word segmentation
import wordcloud  # word cloud rendering
import cv2        # reads the local image used as the word cloud's mask shape
def getPc(path):
    img = cv2.imread('../../img/apple.jpg')  # mask image (the shape feature is the part that still fails)
    # Read the scraped comments
    with open('C://Users//Wfy//Desktop//{}.txt'.format(path), encoding='utf-8') as f:
        text = f.read()
    # Tokenize: split the text into individual words
    text_list = jieba.lcut(text)
    text_list = [word.strip() for word in text_list if len(word.strip()) > 1]  # keep only words longer than one character
    # Join the list back into one space-separated string
    text_str = ' '.join(text_list)
    # Word cloud configuration
    wc = wordcloud.WordCloud(
        width=5000,               # width (ignored when a mask is set)
        height=5000,              # height (ignored when a mask is set)
        background_color='white',
        mask=img,                 # array that sets the cloud's shape
        stopwords={'和', '别人', '虽然', '我', '了', '注', 'doge'},
        font_path='msyh.ttc'      # font file; required for Chinese text
    )
    wc.generate(text_str)
    wc.to_file('C://Users//Wfy//Desktop//{}.png'.format(path))
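The broken apple shape most likely comes down to the mask format: WordCloud treats pure-white (255) pixels as "do not draw here", and a JPEG background is rarely exactly white, so the whole canvas stays drawable and the shape is lost. A sketch of a possible fix, untested against the original image (the path is the one getPc already uses):
# Possible fix for the apple-shaped mask (a sketch, not verified on the original image).
import cv2

def load_mask(mask_path='../../img/apple.jpg'):
    gray = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        raise FileNotFoundError(mask_path)  # cv2.imread returns None on a bad path instead of raising
    # Pixels brighter than 200 become 255 (masked out); the dark apple becomes 0 (drawable)
    _, mask = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY)
    return mask
Passing mask=load_mask() in the WordCloud call above would then confine the words to the apple silhouette.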
Finally, main.py ties the two scripts together: it imports both files and calls them in order.
import B站评论爬虫改
import 词云增强
geturl = B站评论爬虫改.geturl()
print('File name: ' + geturl)
词云增强.getPc(geturl)
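One quirk worth knowing: the url: prompt fires the moment import B站评论爬虫改 executes, because aid = get_oid(...) and the open(...) call sit at module level. A hypothetical refactor (not the original code) that moves those two lines inside geturl() so nothing runs at import time:
# Hypothetical refactor of geturl(): defer the prompt and the file handle
# until the function is actually called.
def geturl():
    global aid, fd
    aid = get_oid(url=input('url: '))
    fd = open('C://Users//Wfy//Desktop//{}.txt'.format(aid), 'a', encoding='utf-8-sig')
    get_first_reply()
    fd.close()
    return aid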