需求
根据第1部分自然语言处理教学内容,请选择一本你喜欢的小说,利用上课讲的但不限于授课内容,对该小说进行分析。比如分析该小说的分词,词频,词性,小说人物出场次数排序,小说中食物排序(这个得有,我喜欢吃),小说人物关系等等。
1、前期准备
1.1 导入库
1.2 小说、用户字典、食物清单、停用词等txt文档 和 字体simfang.ttf 以及词云用到的图片
以上资料自行百度下载 或者 自我总结
2、源码
'''
Autor: 何邦渊
DateTime: 2022/3/20 21:24
IDE: PyCharm
Function: 根据第1部分自然语言处理教学内容,请选择一本你喜欢的小说,利用上课讲的但不限于授课内容,对该小说进行分析。比如分析该小说的分词,词频,
词性,小说人物出场次数排序,小说中食物排序(这个得有,我喜欢吃),小说人物关系等等。
要求:1代码以py文件附件形式上传,有功能性注释和普通注释。
2.功能介绍和运行结果截图可以在作业里写上。
3.小说文件用txt形式存储。
4.最后视功能完整性给分.
'''
import random
import networkx as nx
from imageio import imread
from wordcloud import WordCloud,ImageColorGenerator
import jieba
import jieba.posseg as pseg
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.font_manager import FontProperties
# Words that jieba mis-tags as person names ('nr') — game terms, club
# names, skill names, etc. — which must be removed from the
# character-appearance ranking.
excludes = ['乐章','小姑娘','荣耀','易拉灌','易容术','明白','全明星','蓝溪阁','季后赛','本赛季','砰砰','和兴欣','上赛季','华丽','司仪',
'西风','连胜','银武','周旋','马踏','安静','大屏幕','和嘉世','修正','了兴欣','卫星','谢谢','呼啸山庄','马甲','明星','英勇',
'真是太','冷不丁','小精灵','高潮','太久','布阵','祝福','段时间','格斗','高水平','言语','别提','冷笑','晓枪','白痴','赛中',
'顾忌','越来越近','封锁','小镇','贡献度','高阶','嘉世']
# Use a CJK-capable font so matplotlib can render Chinese labels, and keep
# the minus sign rendering correctly alongside it.
plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# FangSong font for chart labels; absolute path is machine-specific — TODO confirm.
font = FontProperties(fname=r"C:\Python\src\python与数据分析\simfang.ttf", size=14)
def open_text(path):
    """Read a UTF-8 text file and return its lines with surrounding
    whitespace stripped.

    path: path of the text file to read.
    Returns: list[str]; blank lines become empty strings.
    """
    with open(path, 'r', encoding='utf-8') as f:
        # Iterate the file object directly instead of f.readlines():
        # same result without materializing the raw lines twice.
        return [line.strip() for line in f]
def seg_depart(path, total):
    """Segment the novel with jieba POS tagging and record word/POS
    frequencies.

    path: path of the novel txt file (UTF-8, one paragraph per line).
    total: dict to fill; keys are (word, POS-flag) tuples, values are counts.
    Side effects: writes 全职高手分词词频词性.txt ("word,flag,count" lines)
    and 纯净版全职高手.txt (all kept words concatenated).
    Returns: the same `total` dict, filled.
    """
    kept_words = []
    # A set gives O(1) membership tests; the original list was O(n) per word.
    # Path fixed from '.\stopword.txt': '\s' is an invalid escape and the
    # backslash separator breaks on non-Windows systems.
    stopwords = set(open_text('./stopword.txt'))
    with open(path, 'r', encoding='utf-8') as text:
        for line in text:
            for word, flag in pseg.cut(line.strip()):
                # Keep words of length >= 2 that are neither stopwords nor
                # pure digits (len >= 2 also rejects '' and '\t').
                if len(word) >= 2 and word not in stopwords and not word.isdigit():
                    total[(word, flag)] = total.get((word, flag), 0) + 1
                    kept_words.append(word)
    with open('./全职高手分词词频词性.txt', 'w', encoding='utf-8') as text1:
        for (word, flag), value in total.items():
            text1.write('%s,%s,%d\n' % (word, flag, value))
    with open('./纯净版全职高手.txt', 'w', encoding='utf-8') as text2:
        # Single join instead of quadratic string concatenation.
        text2.write(''.join(kept_words))
    return total
def character_sequence(total):
    """Aggregate appearance counts per character, merging each player's
    in-game ID into the real name, and write the ranking (descending)
    to 全职高手人物出场次数排序.txt.

    total: dict mapping (word, POS-flag) -> frequency from seg_depart().
    """
    # In-game ID -> real character name. Replaces the original long
    # if/elif chain; also fixes the 黄少天 entry, whose comparison string
    # had a stray leading space and therefore could never match.
    aliases = {
        '君莫笑': '叶修', '沐雨橙风': '苏沐橙', '海无量': '方锐',
        '寒烟柔': '唐柔', '一寸灰': '乔一帆', '包子入侵': '包荣兴',
        '昧光': '罗辑', '毁人不倦': '莫凡', '小手冰凉': '安文逸',
        '逐烟霞': '陈果', '迎风布阵': '魏琛', '一叶知秋': '孙翔',
        '大漠孤烟': '韩文清', '索克萨尔': '喻文州', '夜雨声烦': '黄少天',
        '王不留行': '王杰希',
    }
    sequence = {}
    for (word, flag), value in total.items():
        if flag == 'nr':  # person-name tokens only
            name = aliases.get(word, word)
            sequence[name] = sequence.get(name, 0) + value
    # Drop words jieba mis-tagged as person names (counts are always >= 1,
    # so pop() is equivalent to the original get()>0 check + del).
    for word in excludes:
        sequence.pop(word, None)
    sequence_new = sorted(sequence.items(), key=lambda x: x[1], reverse=True)
    with open('./全职高手人物出场次数排序.txt', 'w', encoding='utf-8') as f:
        for name, num in sequence_new:
            f.write('%s,%d\n' % (name, num))
def food_sequence(total):
    """Count food-word occurrences and write 全职高手食物排序.txt ranked
    by count, descending.

    total: dict mapping (word, POS-flag) -> frequency from seg_depart().
    """
    food = set(open_text('./全职高手食物.txt'))  # set: O(1) membership
    sequence = {}
    for (word, flag), value in total.items():
        if word in food:
            # Accumulate across POS tags; the original plain assignment
            # silently overwrote the count when a word carried several tags.
            sequence[word] = sequence.get(word, 0) + value
    # The output file is a ranking ("排序") — actually sort it, matching
    # the behavior of character_sequence().
    with open('./全职高手食物排序.txt', 'w', encoding='utf-8') as f:
        for word, value in sorted(sequence.items(), key=lambda x: x[1], reverse=True):
            f.write('%s,%d\n' % (word, value))
# One node color per character listed in 全职高手人物.txt (used by the
# relationship graph below).
colorNum = len(open_text('./全职高手人物.txt'))
def randomcolor():
    """Return a random hex color string such as '#3FA9C2'.

    The palette omits '0', reproducing the original 15-character set.
    """
    palette = ['1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F']
    # Same randint-indexing as before (identical random stream), built
    # with a single join instead of repeated string concatenation.
    return '#' + ''.join(palette[random.randint(0, 14)] for _ in range(6))
def color_list():
    """Return `colorNum` random hex colors, one per graph node."""
    return [randomcolor() for _ in range(colorNum)]
def creat_relationship(path):
    """Build and draw a co-occurrence social network of the main
    characters, saving it as 全职高手人物关系图.

    path: the novel txt file; each line is treated as one paragraph.
    """
    colors = color_list()
    Names = open_text('./全职高手人物.txt')
    relations = {}
    lst_para = open_text(path)
    # Count, per paragraph, every pair of names that co-occur. The
    # `(name_1, name_0) not in relations` guard keeps each unordered
    # pair under a single dict key.
    for text in lst_para:
        for name_0 in Names:
            if name_0 in text:
                for name_1 in Names:
                    if name_1 in text and name_0 != name_1 and (name_1, name_0) not in relations:
                        relations[(name_0, name_1)] = relations.get((name_0, name_1), 0) + 1
    # Normalize edge weights into [0, 1] by the strongest relation.
    maxRela = max([v for k, v in relations.items()])
    relations = {k: v / maxRela for k, v in relations.items()}
    plt.figure(figsize=(15, 15))
    G = nx.Graph()
    for k, v in relations.items():
        G.add_edge(k[0], k[1], weight=v)
    # Bucket edges by weight so strong/medium/weak ties get distinct styling.
    elarge = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] > 0.6]
    emidle = [(u, v) for (u, v, d) in G.edges(data=True) if (d['weight'] > 0.3) & (d['weight'] <= 0.6)]
    esmall = [(u, v) for (u, v, d) in G.edges(data=True) if d['weight'] <= 0.3]
    pos = nx.spring_layout(G)
    # NOTE(review): `colors` has one entry per name in 全职高手人物.txt;
    # this assumes every listed name actually ends up as a node in G — confirm.
    nx.draw_networkx_nodes(G, pos, alpha=0.8, node_size=1300, node_color=colors)
    nx.draw_networkx_edges(G, pos, edgelist=elarge, width=2.5, alpha=0.9, edge_color='g')
    nx.draw_networkx_edges(G, pos, edgelist=emidle, width=1.5, alpha=0.6, edge_color='y')
    nx.draw_networkx_edges(G, pos, edgelist=esmall, width=1, alpha=0.4, edge_color='b', style='dashed')
    nx.draw_networkx_labels(G, pos, font_size=14)
    plt.title("《全职高手》主要人物社交关系网络图")
    plt.axis('off')
    plt.savefig('./全职高手人物关系图', bbox_inches='tight')
    plt.show()
def GetWordCloud():
    """Generate a word cloud of the cleaned novel text, shaped and
    colored by 动漫.jpg, display it, and save it as wordcloud.jpg.
    """
    path_txt = './纯净版全职高手.txt'
    path_img = './动漫.jpg'
    # with-block fixes the original's leaked file handle (open without close).
    with open(path_txt, 'r', encoding='utf-8') as f:
        content = f.read()
    background_image = imread(path_img)
    # Re-segment the concatenated text: WordCloud expects space-separated tokens.
    cut_text = " ".join(jieba.cut(content))
    wordcloud = WordCloud(
        background_color="white",
        mask=background_image,
        max_words=400,
        width=600,
        height=800,
        font_path="./simfang.ttf",  # CJK-capable font, required for Chinese
        max_font_size=50,
        min_font_size=10,
        random_state=30,
        margin=2,
    )
    wc = wordcloud.generate(cut_text)
    # Recolor the words from the mask image's own colors before display.
    image_colors = ImageColorGenerator(background_image)
    plt.imshow(wc.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")
    plt.show()
    wc.to_file('./wordcloud.jpg')
def create_wordPhotograph(total):
    """Bar-chart the 10 most frequent (word, POS) pairs.

    total: dict mapping (word, POS-flag) -> frequency.
    Side effects: saves 分词词频词性可视化图.jpg and shows the figure.
    """
    plt.figure(figsize=(9, 6))
    top = Counter(total).most_common(10)
    labels = [word + "_" + flag for (word, flag), _ in top]
    counts = [num for _, num in top]
    # Size the axis from the data instead of a hard-coded 10, so fewer
    # than 10 distinct pairs no longer breaks the plot.
    xs = np.arange(len(counts))
    plt.bar(xs, counts, width=0.3, facecolor='red', edgecolor='white')
    plt.xticks(xs, labels)
    # Annotate each bar with its count; the zip already yields the value,
    # so the original's manual index counter is gone.
    for x, y in zip(xs, counts):
        plt.text(x + 0.15, y + 0.1, '%d' % y, ha='center', va='bottom')
    plt.xlabel(u"分词词性Top10")
    plt.ylabel(u"词频数")
    plt.title(u"分词词频词性可视化图")
    plt.savefig('./分词词频词性可视化图.jpg', bbox_inches='tight')
    plt.show()
def create_CharacterPhotograph():
    """Bar-chart the top-10 characters by appearance count, read back
    from 全职高手人物出场次数排序.txt ("name,count" per line).

    Side effects: saves 人物出场次序排序可视化图.jpg and shows the figure.
    """
    plt.figure(figsize=(9, 6))
    labels = []
    counts = []
    # Slice to the first 10 lines instead of counting with a flag; the
    # original also kept looping over the whole file after collecting 10.
    for line in open_text('./全职高手人物出场次数排序.txt')[:10]:
        name, num = line.split(',')
        labels.append(name)
        counts.append(int(num))
    # Size the axis from the data: a file with fewer than 10 lines used
    # to crash on the hard-coded np.arange(10).
    xs = np.arange(len(counts))
    plt.bar(xs, counts, width=0.3, facecolor='red', edgecolor='white')
    plt.xticks(xs, labels)
    for x, y in zip(xs, counts):
        plt.text(x + 0.15, y + 0.1, '%d' % y, ha='center', va='bottom')
    plt.xlabel(u"出场人物Top10")
    plt.ylabel(u"出场次数")
    plt.title(u"人物出场次序排序可视化图")
    plt.savefig('./人物出场次序排序可视化图.jpg', bbox_inches='tight')
    plt.show()
def create_foodPhotograph():
    """Bar-chart every food's appearance count from 全职高手食物排序.txt
    ("food,count" per line).

    Side effects: saves 食物出场排序可视化图.jpg and shows the figure.
    """
    plt.figure(figsize=(9, 6))
    labels = []
    counts = []
    # The original guarded each iteration with `i < len(text)`, which is
    # always true — every line is processed, so just loop plainly.
    for line in open_text('./全职高手食物排序.txt'):
        name, num = line.split(',')
        labels.append(name)
        counts.append(int(num))
    xs = np.arange(len(counts))
    plt.bar(xs, counts, width=0.3, facecolor='red', edgecolor='white')
    plt.xticks(xs, labels)
    for x, y in zip(xs, counts):
        plt.text(x + 0.15, y + 0.1, '%d' % y, ha='center', va='bottom')
    plt.xlabel(u"出场食物名")
    plt.ylabel(u"出场次数")
    plt.title(u"食物出场排序可视化图")
    plt.savefig('./食物出场排序可视化图.jpg', bbox_inches='tight')
    plt.show()
def main():
    """Run the full analysis pipeline: segmentation, character/food
    rankings, relationship graph, word cloud, and the three bar charts.
    """
    total = {}
    # Custom dictionary keeps character names / in-game IDs from being
    # split by jieba's default dictionary.
    jieba.load_userdict("./全职高手用户字典.txt")
    total = seg_depart("./全职高手.txt", total)
    character_sequence(total)
    food_sequence(total)
    creat_relationship("./全职高手.txt")
    GetWordCloud()
    create_wordPhotograph(total)
    create_CharacterPhotograph()
    create_foodPhotograph()

if __name__ == '__main__':
    # Guard so importing this module for reuse doesn't rerun the pipeline.
    main()
3、运行结果
3.1 全职高手分词词频词性.txt
3.2 全职高手人物出场次数排序.txt
3.3 全职高手食物排序.txt
3.4 全职高手人物关系图.png
3.5 词云
3.6 分词词频词性可视化图.jpg
3.7 人物出场次序排序可视化图.jpg
3.8 食物出场排序可视化图.jpg