import html
import re

import matplotlib.pyplot as plt
import requests
from lxml import etree
from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator
def main():
    """Download a video's danmaku, store them, and display a word cloud.

    The video page is fetched first so that its ``cid`` can be extracted
    with ``get_cid``; the cid selects the danmaku XML document, whose
    comments are extracted with ``get_target``, handed to ``_write`` and
    finally visualised by ``_wordcloud``.

    Raises:
        requests.RequestException: if the video page cannot be retrieved
            (connection error, timeout, or an HTTP error status).
    """
    url = 'https://www.bilibili.com/video/BV1uL41157Nd?spm_id_from=333.851.b_7265636f6d6d656e64.5'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }
    # Without a timeout requests may wait forever on a stalled connection.
    page = requests.get(url, headers=headers, timeout=10)
    # Fail here with the real HTTP error instead of letting an error page
    # reach get_cid, where it would surface as an unrelated lookup failure.
    page.raise_for_status()

    cid = get_cid(page.text)
    url2 = 'https://comment.bilibili.com/' + cid + '.xml'
    content1 = get_content(url2)
    comments = get_target(content1)
    _write(comments)
    _wordcloud()
def get_cid(res):
    """Return the first ``cid`` value embedded in the page source *res*.

    The value is the text between ``"cid=`` and the following ``&aid``.

    Args:
        res: page source as a string.

    Returns:
        The captured cid as a string.

    Raises:
        IndexError: if *res* contains no ``"cid=...&aid`` fragment. The
            exception type matches what the previous list-indexing
            implementation raised, so existing handlers keep working.
    """
    # search() stops at the first hit instead of collecting every match.
    match = re.search(r'"cid=(?P<cid1>.*?)&aid', res)
    if match is None:
        raise IndexError('no "cid=...&aid" fragment found in page source')
    return match.group('cid1')
def get_content(url2):
    """Fetch *url2* and return the response body decoded as UTF-8.

    Args:
        url2: URL to request.

    Returns:
        The response text.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status.
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36'
    }
    # Without a timeout requests may wait forever on a stalled connection.
    response = requests.get(url2, headers=headers, timeout=10)
    # An error page must not be returned as if it were the requested document.
    response.raise_for_status()
    # Force UTF-8 rather than relying on the encoding requests guesses.
    response.encoding = 'Utf-8'
    return response.text
def get_target(content1):
    """Extract the text of every ``<d p="...">...</d>`` element.

    Args:
        content1: XML document as a string.

    Returns:
        A list with the text of each matched element, in document order.
        Character and entity references are decoded, so ``&amp;`` comes
        back as ``&`` and ``&lt;`` as ``<``.
    """
    pattern = re.compile('<d p=".*?">(.*?)</d>')
    # The regex captures the raw markup, where special characters are still
    # escaped; decode them so callers get the text as it was written.
    return [html.unescape(raw) for raw in pattern.findall(content1)]
'''def _print(comments):
for i in comments:
print(i)'''
def _write(commeents):
    """Append every comment to ``Barrage.txt``, one per line, as UTF-8.

    The file is opened once for the whole batch instead of once per
    comment. It is opened in append mode, so earlier content is kept; as a
    consequence of opening up front, an empty input still creates the file.

    Args:
        commeents: iterable of comment strings. The misspelled parameter
            name is retained so keyword callers are not broken.
    """
    with open('Barrage.txt', 'a', encoding='utf-8') as f:
        f.writelines(comment + '\n' for comment in commeents)
def _wordcloud():
    """Build a word cloud from ``./Barrage.txt`` and show it in a window."""
    with open('./Barrage.txt', 'r', encoding='utf-8') as barrage_file:
        barrage_text = barrage_file.read()

    cloud_options = {
        'font_path': "C:/Windows/Fonts/simfang.ttf",  # font file path
        'background_color': "white",
        'width': 1920,
        'height': 1080,
    }
    cloud = WordCloud(**cloud_options)
    cloud.generate(barrage_text)

    # Display the rendered cloud without axis ticks or frame.
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
|