import os
import pandas as pd
def get_dataSet(base_dir='../MLinAction_source/email', n_files=25):
    """Build the labelled e-mail data set used by the experiments.

    Reads the files ``1.txt`` .. ``<n_files>.txt`` from ``<base_dir>/ham``
    and then from ``<base_dir>/spam``. Each file is decoded as GBK, and
    undecodable bytes are silently dropped (``errors='ignore'``).

    Args:
        base_dir: Directory containing the ``ham`` and ``spam`` sub-folders.
        n_files: Number of consecutively numbered files to read per folder.

    Returns:
        A pandas DataFrame with one row per e-mail. Column ``0`` holds the
        raw text and column ``1`` the label (``'ham'`` or ``'spam'``). All
        ham rows come first, followed by the spam rows, with a fresh
        0-based index.

    Raises:
        OSError: Propagated from ``open`` when a file cannot be read.
    """
    rows = []
    for label in ('ham', 'spam'):
        for i in range(1, n_files + 1):
            file_path = os.path.join(base_dir, label, '%d.txt' % i)
            # The context manager closes each handle right after reading,
            # so no file descriptors are left open.
            with open(file_path, encoding='gbk', errors='ignore') as fh:
                rows.append([fh.read(), label])
    # Explicit integer column labels keep the dataSet[0] / dataSet[1] access
    # pattern valid even when no files were read.
    return pd.DataFrame(rows, columns=[0, 1])
# Load the labelled e-mails: column 0 is the text, column 1 the label.
dataSet = get_dataSet()
print(dataSet)

# TfidfVectorizer = CountVectorizer + TfidfTransformer:
#   * CountVectorizer converts text documents into a term-count matrix;
#   * TfidfTransformer converts a count matrix into a normalised tf or
#     tf-idf representation.
# feature_extraction: feature extraction.
# tf (term frequency): how often a term occurs in a document.
# idf (inverse document frequency).
# tf-idf = tf * idf.
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer()
# fit_transform is equivalent to fit() followed by transform() on the same
# data, done in a single pass.
data_tf = tf.fit_transform(dataSet[0])
# NOTE(review): the vectoriser is fitted on every document before the split,
# so the idf weights are influenced by the held-out e-mails. Confirm whether
# that is acceptable for this experiment.

from sklearn.model_selection import train_test_split
# A fixed random_state makes the 80/20 split, and therefore the accuracies
# printed below, reproducible from run to run.
xtrain, xtest, ytrain, ytest = train_test_split(
    data_tf, dataSet[1], test_size=0.2, random_state=0
)

from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
# Multinomial naive Bayes on the tf-idf features.
mnb = MultinomialNB()
mnb.fit(xtrain, ytrain)
acc = mnb.score(xtest, ytest)
print('多项式分布朴素贝叶斯准确率:', acc)
# Bernoulli naive Bayes on the same split, for comparison.
bnb = BernoulliNB()
bnb.fit(xtrain, ytrain)
acc = bnb.score(xtest, ytest)
print('伯努利分布朴素贝叶斯准确率:', acc)
from sklearn.model_selection import StratifiedKFold, cross_val_score
import matplotlib
import matplotlib.pyplot as plt
# Use a font with CJK glyphs so the Chinese legend labels render.
plt.rcParams['font.sans-serif'] = ['Simhei']

# Repeat 10-fold cross-validation ten times for both classifiers.
# With a plain ``cv=10`` the folds are unshuffled and identical on every
# repetition, so each pass would return exactly the same mean score. A
# shuffled StratifiedKFold seeded by the repetition index gives each pass a
# different, yet reproducible, partition.
mnbs = []
bnbs = []
for i in range(10):
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=i)
    mnb = MultinomialNB()
    mnb_s = cross_val_score(mnb, data_tf, dataSet[1], cv=folds).mean()
    mnbs.append(mnb_s)
    bnb = BernoulliNB()
    bnb_s = cross_val_score(bnb, data_tf, dataSet[1], cv=folds).mean()
    bnbs.append(bnb_s)

# Mean accuracy per repetition (x axis: repetition number 1..10).
plt.plot(range(1, 11), mnbs, label="多项式朴素贝叶斯")
plt.plot(range(1, 11), bnbs, label="伯努利朴素贝叶斯")
plt.legend()
plt.show()
# Email dataset: download link (URL missing from this copy); extraction code: k9n3