机器学习分类问题全流程
冀以尘雾之微补益山海,荧烛末光增辉日月。 ——2022/1/27
导入类库
导入可视化模块、机器学习库中的模型评估模块和模型库模块,以方便后续使用。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import read_csv
from pandas.plotting import scatter_matrix
from pandas import set_option
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
导入数据
将要用的数据集导入,本文以玻璃数据集glass_train.csv为例。
filename = 'glass_train.csv'
dataset = read_csv(filename, header=0)
tag = read_csv('glass_train_labels.csv', header=0)
df=pd.concat([dataset,tag],axis=1)
df.columns=['RI','Na','Mg','AI','Si','K','Ca','Ba','Fe','tag']
df
数据探查与预处理
这里主要介绍简单的数据处理,对于复杂的数据集需要对数据做特征工程进行进一步的处理。
print(df .shape)
df.describe()
df.dtypes
df.isna().sum()
print(tag.value_counts())
df.corr(method='pearson')
sns.heatmap(df.corr())
plt.show()
sns.kdeplot(df["tag"] , color="#1874CD",label="tag_label", alpha=.7)
plt.grid()
plt.show()
array = df.values
X = array[:, 0:9].astype(float)
Y = array[:, 9]
validation_size = 0.2
seed = 7
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size=validation_size, random_state=seed)
算法审查-训练模型
确定评估算法的基准,利用传统的机器学习算法进行训练并比较训练结果
num_folds = 10
seed = 7
scoring = 'accuracy'
models = {}
models['LR'] = LogisticRegression()
models['LDA'] = LinearDiscriminantAnalysis()
models['KNN'] = KNeighborsClassifier()
models['CART'] = DecisionTreeClassifier()
models['SVM'] = SVC()
results = []
for key in models:
kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
cv_results = cross_val_score(models[key], X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(models.keys())
pyplot.show()
pipelines = {}
pipelines['ScalerLR'] = Pipeline([('Scaler', StandardScaler()), ('LR', LogisticRegression())])
pipelines['ScalerLDA'] = Pipeline([('Scaler', StandardScaler()), ('LDA', LinearDiscriminantAnalysis())])
pipelines['ScalerKNN'] = Pipeline([('Scaler', StandardScaler()), ('KNN', KNeighborsClassifier())])
pipelines['ScalerCART'] = Pipeline([('Scaler', StandardScaler()), ('CART', DecisionTreeClassifier())])
pipelines['ScalerNB'] = Pipeline([('Scaler', StandardScaler()), ('NB', GaussianNB())])
results = []
for key in pipelines:
kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
cv_results = cross_val_score(pipelines[key], X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_results)
print('%s : %f (%f)' % (key, cv_results.mean(), cv_results.std()))
fig = plt.figure()
fig.suptitle('Scaled Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(pipelines.keys())
plt.show()
ensembles = {}
ensembles['ScaledAB'] = Pipeline([('Scaler', StandardScaler()), ('AB', AdaBoostClassifier())])
ensembles['ScaledGBM'] = Pipeline([('Scaler', StandardScaler()), ('GBM', GradientBoostingClassifier())])
ensembles['ScaledRF'] = Pipeline([('Scaler', StandardScaler()), ('RFR', RandomForestClassifier())])
ensembles['ScaledET'] = Pipeline([('Scaler', StandardScaler()), ('ETR', ExtraTreesClassifier())])
results = []
for key in ensembles:
kfold = KFold(n_splits=num_folds, random_state=seed,shuffle=True)
cv_result = cross_val_score(ensembles[key], X_train, Y_train, cv=kfold, scoring=scoring)
results.append(cv_result)
print('%s: %f (%f)' % (key, cv_result.mean(), cv_result.std()))
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(ensembles.keys())
plt.show()
通过以上结果进行比较分析,得到结果最好的模型。
模型调参
from sklearn.ensemble import GradientBoostingClassifier
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
param_grid = {'n_estimators': [10, 50, 100, 200, 300, 400, 500, 600, 700, 800, 900]}
model = GradientBoostingClassifier()
kfold = KFold(n_splits=num_folds,shuffle=True, random_state=seed)
grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X=rescaledX, y=Y_train)
print('最优:%s 使用%s' % (grid_result.best_score_, grid_result.best_params_))
模型最终化&模型评估
根据上述的模型结果选择最佳的模型,以下只是一个参考。
from lightgbm import LGBMClassifier
scaler = StandardScaler().fit(X_train)
rescaledX = scaler.transform(X_train)
model = LGBMClassifier()
model.fit(X=rescaledX, y=Y_train)
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(predictions)
a = pd.DataFrame()
a['预测值'] = list(predictions)
a['实际值'] = list(Y_validation)
a.head()
features = ['RI','Na','Mg','AI','Si','K','Ca','Ba','Fe']
importances = model.feature_importances_
importances_df = pd.DataFrame()
importances_df['特征名称'] = features
importances_df['特征重要性'] = importances
importances_df.sort_values('特征重要性', ascending=False)
rescaled_validationX = scaler.transform(X_validation)
predictions = model.predict(rescaled_validationX)
print(accuracy_score(Y_validation, predictions))
print(confusion_matrix(Y_validation, predictions))
print(classification_report(Y_validation, predictions))
参考书籍:Python机器学习实战案例_赵卫东,董亮编著_2019.12 全书源代码:https://github.com/weizy1981/MachineLearning
本文链接:https://blog.csdn.net/qq_46426207/article/details/122723777