Splitting the Dataset
import sklearn.model_selection as ms
# Hold out 10% of the samples as a test set.
train_x, test_x, train_y, test_y = ms.train_test_split(x,
                                                       y,
                                                       test_size=0.1,
                                                       random_state=7)
Linear Regression
import numpy as np
import sklearn.linear_model as lm
import sklearn.metrics as sm
import matplotlib.pyplot as mp
train_x = np.array([[0.5], [0.6], [0.8], [1.1], [1.4]])
train_y = np.array([5.0, 5.5, 6.0, 6.8, 7.0])
model = lm.LinearRegression()
model.fit(train_x, train_y)
pred_y = model.predict(train_x)
print("coef_:", model.coef_)
print("intercept_:", model.intercept_)
mp.figure('Linear Regression', facecolor='lightgray')
mp.title('Linear Regression', fontsize=20)
mp.xlabel('x', fontsize=14)
mp.ylabel('y', fontsize=14)
mp.tick_params(labelsize=10)
mp.grid(linestyle=':')
mp.scatter(train_x, train_y, c='blue', alpha=0.8, s=60, label='Sample')
mp.plot(train_x,
        pred_y,
        c='orangered', label='Regression')
mp.legend()
mp.show()
Lasso Regression and Ridge Regression
import sklearn.linear_model as lm
import sklearn.metrics as sm
# Lasso: L1-regularized linear regression.
model_lasso = lm.Lasso(alpha=0.5,
                       max_iter=1000)
model_lasso.fit(train_x, train_y)
pred_test_y = model_lasso.predict(test_x)
print('Lasso_Score:-->', sm.r2_score(test_y, pred_test_y))
# Ridge: L2-regularized linear regression.
model_ridge = lm.Ridge(alpha=100, max_iter=1000)
model_ridge.fit(train_x, train_y)
pred_test_y = model_ridge.predict(test_x)
print('Ridge_Score:-->', sm.r2_score(test_y, pred_test_y))
Polynomial Regression
import matplotlib.pyplot as plt
import pandas as pd
import sklearn.pipeline as pl
import sklearn.preprocessing as sp
import sklearn.linear_model as lm
data = pd.read_csv('./Salary_Data.csv')
x, y = data['YearsExperience'], data['Salary']
train_x = pd.DataFrame(x)  # keep the feature matrix 2-D for sklearn
train_y = y
# Degree-4 polynomial features feeding a linear regressor.
model = pl.make_pipeline(sp.PolynomialFeatures(4),
                         lm.LinearRegression())
model.fit(train_x, train_y)
pred_train_y = model.predict(train_x)
plt.grid(linestyle=':')
plt.scatter(x, y)
plt.plot(x, pred_train_y, color='orangered')
plt.show()
Decision Trees
Boosting methods: AdaBoost (adaptive boosting) and GBDT (gradient boosted decision trees, fitted to residuals)
import sklearn.metrics as sm
import sklearn.ensemble as se
from sklearn import tree
# Base learner: a shallow regression tree, boosted by AdaBoost.
model = tree.DecisionTreeRegressor(max_depth=4)
model_ada = se.AdaBoostRegressor(model,
                                 n_estimators=400,
                                 random_state=7)
model_ada.fit(train_x, train_y)
pred_test_y = model_ada.predict(test_x)
print('Ada_Score-->:', sm.r2_score(test_y, pred_test_y))
import sklearn.ensemble as se
model_GBDT = se.GradientBoostingRegressor(max_depth=4,
                                          n_estimators=400,
                                          min_samples_split=5)
model_GBDT.fit(train_x,train_y)
pred_test_y = model_GBDT.predict(test_x)
print('GBDT_Score-->:',sm.r2_score(test_y,pred_test_y))
Bagging methods:
import sklearn.ensemble as se
model_forest = se.RandomForestRegressor(max_depth=10,
                                        n_estimators=1000,
                                        min_samples_split=7)
model_forest.fit(train_x,train_y)
pred_test_y = model_forest.predict(test_x)
print('Forest_Score-->:',sm.r2_score(test_y,pred_test_y))
Classification
Logistic Regression (Iris)
import sklearn.datasets as sd
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn.model_selection as ms
import sklearn.linear_model as lm
Iris = sd.load_iris()
data = pd.DataFrame(Iris.data,
                    columns=Iris.feature_names)
data['target'] = Iris.target
# Keep the last 100 rows (classes 1 and 2 only) for a binary problem.
sub_data = data.tail(100)
x = sub_data.iloc[:, :-1]
y = sub_data['target']
train_x, test_x, train_y, test_y = ms.train_test_split(x,
                                                       y,
                                                       test_size=0.1,
                                                       random_state=7)
model = lm.LogisticRegression()
model.fit(train_x,train_y)
pred_test_y = model.predict(test_x)
print('Accuracy:', (pred_test_y == test_y).sum() / test_y.size)
Decision Tree
import sklearn.tree as st
# This is a classification task, so use a classifier rather than a regressor.
model = st.DecisionTreeClassifier(max_depth=4)
model.fit(train_x, train_y)
pred_test_y = model.predict(test_x)
Performance Metrics
import sklearn.metrics as sm
print("recall:", sm.recall_score(test_y,
pred_test_y,
average="macro"))
print("precision:", sm.precision_score(test_y,
pred_test_y,
average="macro"))
print("F1:", sm.f1_score(test_y, pred_test_y,average="macro"))
print(sm.classification_report(y, pred_test_y))
print("\n Confusion Matrix:")
cm = sm.confusion_matrix(test_y, pred_test_y)
print(cm)
Confusion Matrix

|          | Predicted A | Predicted B | Predicted C |
|----------|-------------|-------------|-------------|
| Actual A | 3           | 1           | 1           |
| Actual B | 0           | 4           | 2           |
| Actual C | 0           | 0           | 7           |
A confusion matrix, also called an error matrix, is a standard format for reporting classification accuracy, laid out as an n x n matrix. Each row sums to the number of samples of one actual class, and each column sums to the number of samples assigned to one predicted class. Precision and recall can be read directly from the matrix:

precision = value on the main diagonal / sum of that value's column
recall = value on the main diagonal / sum of that value's row

Example: actual class A has 5 samples (row sum) and 3 samples were predicted as A (column sum). Precision = 3/3, recall = 3/5.
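As a quick check of these formulas, here is a minimal NumPy sketch (not part of the original notes) that computes per-class precision and recall from the matrix above:

import numpy as np

# Rows = actual class, columns = predicted class (the 3x3 table above).
cm = np.array([[3, 1, 1],
               [0, 4, 2],
               [0, 0, 7]])
diag = np.diag(cm).astype(float)
precision = diag / cm.sum(axis=0)  # diagonal / column sums -> [1.0, 0.8, 0.7]
recall = diag / cm.sum(axis=1)     # diagonal / row sums -> [0.6, 0.667, 1.0]
print('precision per class:', precision)
print('recall per class:', recall)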
Cross-Validation
import sklearn.model_selection as ms
# 5-fold cross-validation scored with weighted F1.
scores = ms.cross_val_score(model,
                            x, y,
                            cv=5,
                            scoring='f1_weighted')
Validation curve (done before training): first cross-validate, then plot the validation curve to evaluate the model. Try a custom range of values for one hyperparameter and judge from the plot which value works best (only one parameter can be validated at a time).
import numpy as np
import sklearn.model_selection as ms
import matplotlib.pyplot as plt
params = np.arange(50, 551, 50)
# Score the model across a range of n_estimators values with 5-fold CV.
train_score, test_score = ms.validation_curve(model,
                                              x,
                                              y,
                                              param_name='n_estimators',
                                              param_range=params,
                                              cv=5)
mean_score = test_score.mean(axis=1)
print(mean_score)
plt.grid(linestyle=':')
plt.plot(params,
         mean_score,
         'o-',
         color='dodgerblue',
         label='n_estimators')
plt.legend()
plt.show()
Learning Curve (done before training)
import sklearn.model_selection as ms
# Training-set fractions from 10% to 100%.
params = np.arange(0.1, 1.1, 0.1)
_, train_score, test_score = ms.learning_curve(model,
                                               x,
                                               y,
                                               train_sizes=params,
                                               cv=5)
mean_score = test_score.mean(axis=1)
plt.grid(linestyle=':')
plt.plot(params,mean_score,'o-')
plt.xticks(params)
plt.show()
Support Vector Machine (SVM)
import sklearn.svm as svm
# Linear kernel.
model = svm.SVC(kernel="linear")
# Polynomial kernel of degree 3.
model = svm.SVC(kernel="poly", degree=3)
# RBF kernel; gamma sets the kernel width, C the regularization strength.
model = svm.SVC(kernel="rbf",
                gamma=0.01,
                C=600)
Grid Search
import sklearn.model_selection as ms
params = [
    {"kernel": ["linear"],
     "C": [1, 10, 100, 1000]},
    {"kernel": ["poly"],
     "C": [1],
     "degree": [2, 3]},
    {"kernel": ["rbf"],
     "C": [1, 10, 100, 1000],
     "gamma": [1, 0.1, 0.01, 0.001]}
]
model = ms.GridSearchCV(svm.SVC(), params, cv=5)
model.fit(train_x,train_y)
pred_test_y = model.predict(test_x)
print('Best model parameters:', model.best_params_)
print('Best score:', model.best_score_)
Implementing a Naive Bayes Classifier
sklearn provides three naive Bayes classifiers (a sketch of the two discrete variants follows the list):
- GaussianNB (Gaussian naive Bayes): suited to continuous features that are roughly normally distributed (e.g. height, household income, exam scores)
- MultinomialNB (multinomial naive Bayes): suited to datasets where most features are discrete counts
- BernoulliNB (Bernoulli naive Bayes): suited to binary features or sparse multivariate discrete features
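The worked example below uses GaussianNB on continuous features. For the two discrete variants, here is a minimal sketch on made-up count data (the feature matrix and labels are invented purely for illustration):

import numpy as np
import sklearn.naive_bayes as nb

# Hypothetical discrete features, e.g. word counts per document.
counts = np.array([[2, 1, 0],
                   [0, 3, 1],
                   [1, 0, 4],
                   [0, 1, 3]])
labels = np.array([0, 0, 1, 1])

model_m = nb.MultinomialNB()      # models count features directly
model_m.fit(counts, labels)
print(model_m.predict([[2, 0, 1]]))

model_b = nb.BernoulliNB()        # binarizes features (non-zero -> 1) first
model_b.fit(counts, labels)
print(model_b.predict([[2, 0, 1]]))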
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.model_selection as ms
import sklearn.naive_bayes as nb
import sklearn.metrics as sm
data = pd.read_csv('../data_test/multiple1.txt',
                   header=None,
                   names=['x1', 'x2', 'y'])
x = data.iloc[:,:-1]
y = data['y']
train_x, test_x, train_y, test_y = ms.train_test_split(x,
                                                       y,
                                                       test_size=0.2,
                                                       random_state=7,
                                                       stratify=y)
model = nb.GaussianNB()
model.fit(train_x,train_y)
pred_test_y = model.predict(test_x)
print(sm.classification_report(test_y,pred_test_y))
# Build a grid of points covering the feature space to visualize the decision regions.
xs = np.linspace(data['x1'].min(), data['x1'].max(), 100)
ys = np.linspace(data['x2'].min(), data['x2'].max(), 100)
points = []
for gx in xs:
    for gy in ys:
        points.append([gx, gy])
points = np.array(points)
points_labels = model.predict(points)
plt.scatter(points[:, 0], points[:, 1], c=points_labels, cmap='gray')
plt.scatter(test_x['x1'], test_x['x2'], c=test_y)
plt.show()
K-means Clustering
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.read_csv('../data_test/multiple3.txt', header=None)
plt.scatter(data[0],data[1])
import sklearn.cluster as sc
model = sc.KMeans(n_clusters=4)
model.fit(data)
labels = model.labels_
print('Cluster labels:', labels)
centers = model.cluster_centers_
print('Cluster centers:', centers)
plt.scatter(data[0], data[1], c=labels, cmap='brg')
# Mark each cluster center with a large '+'.
center_x = centers[:, 0]
center_y = centers[:, 1]
plt.scatter(center_x, center_y, c='black', marker='+', s=500)
plt.show()
Density-Based Clustering (DBSCAN)
model = sc.DBSCAN(eps=0.5,        # neighborhood radius
                  min_samples=5)  # minimum neighbors for a core point
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.metrics as sm
data = pd.read_csv('../data_test/multiple3.txt', header=None)
plt.scatter(data[0],data[1])
import sklearn.cluster as sc
# Search over eps values and keep the model with the best silhouette score.
params = np.arange(0.5, 1.0, 0.1)
params_list = []
model_list = []
score_list = []
for i in params:
    model = sc.DBSCAN(eps=i, min_samples=5)
    model.fit(data)
    labels = model.labels_
    score = sm.silhouette_score(data,
                                labels,
                                sample_size=len(data))
    model_list.append(model)
    score_list.append(score)
    params_list.append(i)
best_ind = np.argmax(score_list)
print(model_list[best_ind])
print(params_list[best_ind])
print(score_list[best_ind])