二、实验主要内容 离散化鲍鱼数据。 划分训练集与测试集,并标准化数据。 构建分类模型。 三、实验仪器设备 Windows、Anaconda、Spyder 四、实验步骤 1、离散化鲍鱼数据 由于原始数据中“sex”一列取值为字符型,无法进行数学运算,将该列转换为哑变量。 2、划分训练集与测试集 1)分别按照8:2和5:5的比例划分训练集和测试集; 2)对训练集和测试集进行标准化; 3、构建分类模型 1)构建KNN模型; 2)训练KNN模型,预测测试集鲍鱼的年龄;对年龄预测结果进行准确率分析并做图形化展示;对比不同训练集/测试集比例下的预测结果并做图形化展示。 实验代码
import pandas as pd
import numpy as np
# Load the abalone table; the steps below rely on it having 'sex' and 'rings' columns.
data = pd.read_csv('C://Users//ASUS//Desktop//abalone.data')

# 'sex' is text, so swap it for one indicator column per category (F, I, M),
# placed at the front of the frame.
data1 = pd.get_dummies(data['sex'])
data = data.drop(['sex'], axis=1)
data.insert(0, 'F', data1['F'])
data.insert(1, 'I', data1['I'])
data.insert(2, 'M', data1['M'])

# Index label of the first row with rings == 1 (IndexError if there is none) and
# the relative frequency of every ring count. Neither value is used by the
# visible part of the script.
data_index = data[data['rings'] == 1].index.tolist()[0]
a = data["rings"].value_counts() / len(data)

data_rings = data['rings']
print(data_rings.describe())

# Feature matrix: every row, columns at positions 0-9.
# The row indexer used to be data['rings'], which made iloc pick the rows at the
# *positions* given by the ring counts, so each sample's features were determined
# by its own label instead of being that sample's measurements.
# NOTE(review): positions 0-9 are presumably every column except 'rings' --
# confirm against the file's header.
data2 = data.iloc[:, :10]
data2 = np.array(data2)
print(data2)
print('数据集的形状为:', data2.shape)
print(type(data2))

# Labels as a plain ndarray so they can be indexed by position later on.
data_rings = np.array(data["rings"])
print(data_rings)
print(type(data_rings))
print(data2)
print(data_rings)
print('数据集的形状为:', data_rings.shape)

# Disabled alternative split, kept as a bare string literal (a no-op at runtime).
# `ss` is not defined anywhere in the visible code.
"""
for train_index, test_index in ss.split(data2, data_rings):
print("TRAIN_INDEX:", train_index, "TEST_INDEX:", test_index)#获得索引值
X_train, X_test = data2[train_index], data2[test_index]#训练集对应的值
y_train, y_test = data_rings[train_index], data_rings[test_index]#类别集对应的值
"""
from sklearn.model_selection import train_test_split

# Hold out 20 % of the samples for testing; the fixed random_state keeps the
# split repeatable between runs.
_split_parts = train_test_split(data2, data_rings, test_size=0.2, random_state=22)
data2_train, data2_test, data_rings_train, data_rings_test = _split_parts

# Report the shape of each of the four pieces.
for _caption, _part in (
    ('训练集数据的形状为:', data2_train),
    ('训练集标签的形状为:', data_rings_train),
    ('测试集数据的形状为:', data2_test),
    ('测试集标签的形状为:', data_rings_test),
):
    print(_caption, _part.shape)
from sklearn.neighbors import KNeighborsClassifier

# Default-parameter k-nearest-neighbours classifier trained on the training
# split; fit() hands back the estimator, so it can be bound in one step.
knn = KNeighborsClassifier().fit(data2_train, data_rings_train)
data_rings_predict = knn.predict(data2_test)

# Show the predicted labels next to the true ones (each wrapped in a list,
# as in the report's output).
for _caption, _values in (
    ('测试数据集的预测标签为:\n ', data_rings_predict),
    ('测试数据集的正确标签为:\n ', data_rings_test),
):
    print(_caption, [_values])
# Pair every prediction with its true label, then sort the predictions into
# hits (data_T) and misses (data_F), keeping their original order.
_pairs = list(zip(data_rings_predict, data_rings_test))
data_T = [_guess for _guess, _truth in _pairs if _guess == _truth]
data_F = [_guess for _guess, _truth in _pairs if not _guess == _truth]

print('预测正确的数据为:', data_T)
print('预测错误的数据为', data_F)

_n_right = len(data_T)
_n_wrong = len(data_F)
print(_n_right)
print(_n_wrong)

# Accuracy = hits / all predictions.
print('准确率为:', _n_right / (_n_right + _n_wrong))
import numpy as np
import matplotlib.pyplot as plt
# Font settings applied before drawing -- presumably so the Chinese title and
# labels (and minus signs) render correctly; confirm SimHei is installed.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Pie slices: number of correct vs. wrong predictions, each slice pulled out
# from the centre by its explode offset.
labels = ["预测正确数据", "预测错误数据"]
fracs = [len(data_T), len(data_F)]
exp = [0.05, 0.2]

plt.figure()
plt.title('预测结果展示图')
plt.pie(fracs, explode=exp, labels=labels, shadow=True)
plt.show()
|