import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
# adult.data has no header row; adult.test has an extra first line that must be skipped
train_set = pd.read_csv("E:/soft/adult.data", header=None)
test_set = pd.read_csv("E:/soft/adult.test", header=None, skiprows=1)
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status',
'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
'hours_per_week', 'native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels
print(train_set.head(n=20))
print(train_set.describe())
print(train_set.std())
plt.boxplot(test_set['age'])
plt.show()
plt.hist(test_set['age'])
plt.show()
# rows with age == 0 are obviously invalid; collect them for inspection
err_data = train_set[train_set['age'] == 0]  # equivalently: train_set.query('age == 0')
print(train_set.shape)
# blank out the invalid rows so dropna() can remove them
train_set[train_set['age'] == 0] = np.nan
temp_train_set = train_set.dropna()
print(temp_train_set.shape)
# ' ?' and ' ' mark missing values in this dataset; drop those rows as well
final_train_set = train_set.replace({' ?': np.nan, ' ': np.nan}).dropna()
print(final_train_set.shape)
# labels in adult.test carry a trailing '.'; normalize them to match the training labels
test_set['wage_class'] = test_set.wage_class.replace({' <=50K.': ' <=50K', ' >50K.': ' >50K'})
print(test_set.wage_class.unique())
print(final_train_set.wage_class.unique())
temp_test_set = test_set.dropna()
final_test_set = temp_test_set.replace({' ?': np.nan, ' ': np.nan}).dropna()
print(final_train_set.wage_class.unique())
print(final_test_set.wage_class.unique())
# start with three numeric features so no categorical encoding is needed yet
x_train = final_train_set[['age', 'education_num', 'hours_per_week']]
y_train = final_train_set['wage_class']
knn=KNeighborsClassifier()
knn.fit(x_train,y_train)
print(knn.score(x_train,y_train))
x_test = final_test_set[['age', 'education_num', 'hours_per_week']].head(n=10)
print("Predicted:", list(knn.predict(x_test)))
print("Actual:", list(final_test_set['wage_class'].head(n=10)))
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
rf.fit(x_train, y_train)
print(rf.score(x_train,y_train))
print("预测值",list(rf.predict(x_test)))
print("实际值",list(final_test_set['wage_class'].head(n=10)))
rf2 = RandomForestClassifier()
# add the two capital columns as extra feature dimensions
x_train2 = final_train_set[['age', 'education_num', 'hours_per_week', 'capital_gain', 'capital_loss']]
x_test2 = final_test_set[['age', 'education_num', 'hours_per_week', 'capital_gain', 'capital_loss']].head(n=10)
rf2.fit(x_train2, y_train)
print(rf2.score(x_train2,y_train))
print("预测值",list(rf2.predict(x_text2)))
print("实际值",list(final_test_set['wage_class'].head(n=10)))
runfile('E:/test/t1/t6.py', wdir='E:/test/t1')
    age         workclass   fnlwgt  ...  hours_per_week  native_country  wage_class
0     0         State-gov  1231231  ...              40   United-States       <=50K
1    50  Self-emp-not-inc    83311  ...              13   United-States       <=50K
2    38           Private           ...              40   United-States       <=50K
3    53           Private   234721  ...              40   United-States       <=50K
4    28           Private   338409  ...              40            Cuba       <=50K
5    37           Private   284582  ...              40   United-States       <=50K
6    49           Private   160187  ...              16         Jamaica       <=50K
7    52  Self-emp-not-inc   209642  ...              45   United-States        >50K
8    31           Private    45781  ...              50   United-States        >50K
9    42           Private   159449  ...              40   United-States        >50K
10   37           Private   280464  ...              80   United-States        >50K
11   30         State-gov   141297  ...              40           India        >50K
12   23           Private   122272  ...              30   United-States       <=50K
13   32           Private   205019  ...              50   United-States       <=50K
14   40           Private   121772  ...              40               ?        >50K
15   34           Private   245487  ...              45          Mexico       <=50K
16   25  Self-emp-not-inc   176756  ...              35   United-States       <=50K
17   32           Private   186824  ...              40   United-States       <=50K
18   38           Private    28887  ...              50   United-States       <=50K
19   43  Self-emp-not-inc   292175  ...              45   United-States        >50K
[20 rows x 15 columns]
                age  education_num  capital_gain  capital_loss  hours_per_week
count  32561.000000   32561.000000  32561.000000  32561.000000    32561.000000
mean      38.580449      10.080679   1077.648844     87.303830       40.437456
std       13.642108       2.572720   7385.292085    402.960219       12.347429
min        0.000000       1.000000      0.000000      0.000000        1.000000
25%       28.000000       9.000000      0.000000      0.000000       40.000000
50%       37.000000      10.000000      0.000000      0.000000       40.000000
75%       48.000000      12.000000      0.000000      0.000000       45.000000
max       90.000000      16.000000  99999.000000   4356.000000       99.000000
age                  13.642108
education_num         2.572720
capital_gain       7385.292085
capital_loss        402.960219
hours_per_week       12.347429
dtype: float64
(32561, 15)
(32560, 15)
(30160, 15)
[' <=50K' ' >50K']
[' <=50K' ' >50K']
[' <=50K' ' >50K']
[' <=50K' ' >50K']
0.7995026525198939
Predicted: [' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' >50K', ' <=50K']
Actual: [' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' >50K', ' >50K', ' >50K', ' >50K']
0.8387599469496021
Predicted: [' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' >50K', ' <=50K']
Actual: [' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' >50K', ' >50K', ' >50K', ' >50K']
0.8661472148541114
Predicted: [' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' >50K', ' >50K', ' <=50K']
Actual: [' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' >50K', ' >50K', ' >50K', ' >50K', ' >50K']
Conclusions:
1. Data preprocessing: invalid or anomalous records need to be dropped or imputed as required. Since the training set here is fairly large, a portion of it could also simply be split off as a test set with sklearn's train_test_split (see the first sketch below).
2. Under the same conditions, the random forest score is slightly higher than KNN's, so random forest is the better fit here.
3. After adding feature dimensions, the random forest score rises further and the predictions become more accurate.
4. Non-integer (string) data has to be converted to numeric form before it can be used for model training (see the encoding sketch below).
5. In sklearn, besides KNN and RandomForest, algorithms such as decision trees follow exactly the same usage pattern: fit, score, predict (see the last sketch below).
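A minimal sketch for point 1, assuming the cleaned final_train_set built above: instead of loading a separate test file, train_test_split can hold out a fraction of the training data for evaluation (the 20% split ratio and random_state are illustrative).

from sklearn.model_selection import train_test_split

# hold out 20% of the cleaned training data as a test split
features = final_train_set[['age', 'education_num', 'hours_per_week']]
labels = final_train_set['wage_class']
x_tr, x_te, y_tr, y_te = train_test_split(features, labels, test_size=0.2, random_state=0)
print(x_tr.shape, x_te.shape)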
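A minimal sketch for point 4, assuming the final_train_set and y_train defined above: pandas' get_dummies one-hot encodes string columns into numeric columns that the models accept (the choice of workclass and sex as example columns is illustrative).

# one-hot encode two of the categorical columns alongside the numeric features
encoded_train = pd.get_dummies(
    final_train_set[['age', 'education_num', 'hours_per_week', 'workclass', 'sex']],
    columns=['workclass', 'sex'])
print(encoded_train.columns.tolist())
rf3 = RandomForestClassifier()
rf3.fit(encoded_train, y_train)
print(rf3.score(encoded_train, y_train))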
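A minimal sketch for point 5, reusing x_train, y_train and x_test from the KNN section: sklearn's DecisionTreeClassifier drops into the same fit/score/predict calls with no other changes.

from sklearn.tree import DecisionTreeClassifier

# same three numeric features, same call pattern as KNN and random forest
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
print(dt.score(x_train, y_train))
print("Predicted:", list(dt.predict(x_test)))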