###导入模块
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
### Load the data
# Column labels applied to the sheet; the file is read with header=None,
# so its first row is treated as data rather than as column names.
names = ("x1", "x2", "x3", "y_label")
# Pass the tuple defined above (the original referenced an undefined
# `feature_name`, which raised NameError before any data was read).
dataset = pd.read_excel('./filename.xlsx', names=names, header=None)
# Alternative source:
# dataset = pd.read_csv('./filename.csv')
### Data preparation
# Separate features from the target by position: every column except the
# last is a feature, and the last column is the target.
X = dataset.iloc[:, :-1]
# Select the target positionally as well. The original looked up a column
# called "label", which does not exist (the last column is named "y_label"),
# so it raised KeyError.
Y = dataset.iloc[:, -1]

# Hold out 20% of the rows as a test set.
train_X, test_X, train_Y, test_Y = train_test_split(X, Y, test_size=0.2)
print(train_X.shape)
print(test_X.shape)

# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits, so no test-set statistics leak into the fit.
scaler = StandardScaler().fit(train_X)
X_trainScaler = scaler.transform(train_X)
X_testScaler = scaler.transform(test_X)
### Baseline decision tree
# Train a classifier with default hyper-parameters as a reference point.
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X_trainScaler, train_Y)
# Predict on the held-out (scaled) test features.
prediction = clf.predict(X_testScaler)
# Report the score. The original computed it as a bare expression and threw
# the value away, so only the label was printed. Arguments follow the
# documented (y_true, y_pred) order.
print('accuracy=', metrics.accuracy_score(test_Y, prediction))
### Hyper-parameter search with GridSearchCV
# Search space (dict of parameter name -> candidate values). Every
# combination is evaluated, so the grid size is the product of the lengths.
parameters = {
    'splitter': ('best', 'random'),
    'criterion': ("gini", "entropy"),
    "max_depth": [*range(1, 15)],
    'min_samples_leaf': [*range(1, 50, 5)],
    'min_impurity_decrease': np.arange(0, 0.05, 0.00101),
}

# Base estimator; `criterion` here is only a starting value because the grid
# above also varies it. random_state fixes the tree's own randomness.
clf = tree.DecisionTreeClassifier(criterion='gini', random_state=25)

GS = GridSearchCV(
    clf,          # estimator to tune
    parameters,   # search space
    n_jobs=1,     # run on a single CPU
    verbose=0,    # no progress output
    cv=10,        # 10-fold cross-validation
)
# Run the search on the training split only.
GS.fit(X_trainScaler, train_Y)

# Further inspection, if needed:
# GS.cv_results_                  # full search results
# sorted(GS.cv_results_.keys())   # keys available in the results

# Report the outcome. The original evaluated these as a bare tuple
# expression, which prints nothing when the file is run as a script.
print('best_params=', GS.best_params_)
print('best_score=', GS.best_score_)
print('test_score=', GS.score(X_testScaler, test_Y))
# References
# Grid search tutorial: https://www.cnblogs.com/wanglei5205/p/8581354.html
# scikit-learn GridSearchCV documentation: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html