NRC词典简介:
NRC词典由加拿大国家研究委员会(简称NRC)的专家创建,目前已有多种语言版本,我们可以用其中文版本来进行情感分析。该词典通过一系列带有情绪、情感标注的单词来帮助识别文本中的情绪和情感,也可用于分析话题标签、表情符号以及单词与颜色之间的关联。
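词典中包含情绪的类别主要有以下几种:积极(Positive)、消极(Negative)两种情感极性,以及愤怒(Anger)、期待(Anticipation)、厌恶(Disgust)、恐惧(Fear)、喜悦(Joy)、悲伤(Sadness)、惊讶(Surprise)、信任(Trust)八种情绪。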
我们可以用以下代码查看词典中与情感相关的内容:
import pandas as pd
# Load the multilingual NRC emotion lexicon workbook into a DataFrame.
lexion_df = pd.read_excel(
    'E:/JupyterProject/mybook-main/mybook-main/data/Textmining/NRC-Emotion-Lexicon-v0.92-In105Languages-Nov2017Translations.xlsx'
)
# Notebook-style inspection: preview the first rows and list every column.
lexion_df.head()
lexion_df.columns.tolist()
# Keep only the Simplified Chinese term column and the ten label columns.
chinese_df = lexion_df[
    [
        'Chinese (Simplified) (zh-CN)',
        'Positive',
        'Negative',
        'Anger',
        'Anticipation',
        'Disgust',
        'Fear',
        'Joy',
        'Sadness',
        'Surprise',
        'Trust',
    ]
]
chinese_df.head()
前几行数据如下:
用如下代码构建词语列表:
# Build one word list per NRC label: every Simplified Chinese term whose flag
# in that label's column equals 1, kept in the lexicon's row order.
# A boolean-mask selection per column replaces the row-by-row ``iterrows``
# loop and its ten copy-pasted ``if`` branches; the resulting lists are the
# same.
_zh_terms = chinese_df['Chinese (Simplified) (zh-CN)']
(Positive, Negative, Anger, Anticipation, Disgust,
 Fear, Joy, Sadness, Surprise, Trust) = [
    _zh_terms[chinese_df[label] == 1].tolist()
    for label in (
        'Positive', 'Negative', 'Anger', 'Anticipation', 'Disgust',
        'Fear', 'Joy', 'Sadness', 'Surprise', 'Trust',
    )
]
print('词语列表构建完成')
用下面的代码计算句子各种情感的词汇数量:
import jieba
import time
def emotion_caculate(text):
    """Count how many words of ``text`` fall into each NRC label list.

    The text is passed through ``processing`` and then ``seg_depart`` (both
    defined outside this block; per the original comments they clean the
    text and segment it with stop words removed).  Every distinct token adds
    its number of occurrences to each label whose module-level word list
    (``Positive``, ``Negative``, ``Anger``, ...) contains it.

    Args:
        text: Raw input handed to ``processing``.

    Returns:
        A list of eleven integers: the counts for positive, negative, anger,
        anticipation, disgust, fear, joy, sadness, surprise and trust, in
        that order, followed by the total number of tokens.
    """
    from collections import Counter

    line = processing(text)  # data cleaning
    wordlist = seg_depart(line)  # segmentation + stop-word removal

    # Order matters: it fixes the position of each count in the result.
    lexicons = (
        Positive, Negative, Anger, Anticipation, Disgust,
        Fear, Joy, Sadness, Surprise, Trust,
    )
    totals = [0] * len(lexicons)

    # Counter tallies every token in a single pass, instead of calling
    # ``wordlist.count`` once per distinct token (quadratic in the length).
    for word, freq in Counter(wordlist).items():
        for slot, lexicon in enumerate(lexicons):
            if word in lexicon:
                totals[slot] += freq

    totals.append(len(wordlist))
    return totals
计算文档中各个句子包含的情感强度(用词汇数量表示)并存到另一个文件中:
def emotion_analyse(filename=r'C:\test.csv', result_filename=r"C:\test_result.csv"):
    """Score every record of a CSV file and write the counts to another CSV.

    The first record of ``filename`` is treated as a header and skipped.  For
    each remaining record the second column is passed to
    ``emotion_caculate`` and the returned list becomes one row of the output
    table.  The collected scores are also printed.

    Args:
        filename: Path of the UTF-8 encoded input CSV.
        result_filename: Path the result table is written to with
            ``DataFrame.to_csv``.

    Raises:
        IndexError: If a non-empty record has fewer than two columns.
    """
    # ``csv`` is not imported anywhere in the visible file, so import it here.
    import csv

    senti_count = []
    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(filename, 'r', encoding='UTF-8', newline='') as csvfile:
        reader = csv.reader(csvfile)
        # Skip the header as a CSV record (not as a raw line) and tolerate an
        # empty file instead of raising StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines; there is no text to score.
                continue
            senti_count.append(emotion_caculate(row[1]))
    print(senti_count)
    name = ['positive', 'negative', 'anger', 'anticipation', 'disgust',
            'fear', 'joy', 'sadness', 'surprise', 'trust', 'length']
    test = pd.DataFrame(columns=name, data=senti_count)
    test.to_csv(result_filename)
|