I. SVM
A support vector machine (SVM) is a binary classification model. It seeks a hyperplane that separates the samples, with the separating principle being margin maximization; the learning problem is ultimately solved as a convex quadratic program (the hard-margin objective is written out after the list below). From simple to complex, the models are:
- when the training samples are linearly separable, hard-margin maximization yields a linearly separable SVM;
- when the training samples are approximately linearly separable, soft-margin maximization yields a linear SVM;
- when the training samples are linearly inseparable, the kernel trick plus soft-margin maximization yields a nonlinear SVM.
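For reference, the hard-margin problem can be written as the following convex program (a standard textbook formulation, stated here for completeness rather than taken from this post):

\min_{w,b}\ \frac{1}{2}\lVert w \rVert^2 \quad \text{s.t.}\quad y_i\,(w^\top x_i + b) \ge 1,\quad i = 1,\dots,n

The soft-margin version adds slack variables \xi_i \ge 0 and penalizes them with C\sum_i \xi_i in the objective; that C is exactly the parameter passed to LinearSVC and SVC in the code below.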
II. LDA
Linear discriminant analysis (LDA) is a classic supervised dimensionality-reduction method. Its main idea is to project data from a high-dimensional space into a lower-dimensional one such that, after projection, the within-class variance is small while the between-class means are far apart: projected samples of the same class cluster together, and different classes remain well separated.
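As a minimal sketch of how this looks in scikit-learn (illustrative only; the original post does not include LDA code), LinearDiscriminantAnalysis projects the 4-D iris features down to 2-D using the class labels:

from sklearn.datasets import load_iris
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

X, y = load_iris(return_X_y=True)
lda = LinearDiscriminantAnalysis(n_components=2)   # at most n_classes - 1 components
X_2d = lda.fit_transform(X, y)                     # supervised: the labels y drive the projection
print(X_2d.shape)                                  # (150, 2)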
III. Visualizing SVM classification on datasets
1. Moons dataset
1. Linear SVM
- Import packages
import numpy as np                                 # used by plot_decision_boundary below
import matplotlib.pyplot as plt                    # used by all plots below
from sklearn.datasets import make_moons
from sklearn.svm import LinearSVC
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import StandardScaler
- Standardize the data and train the model
scaler = StandardScaler()
data_x, data_y = make_moons(n_samples=100, noise=0.15, random_state=42)  # 100 noisy two-moons samples
scaler.fit(data_x)
data_x = scaler.transform(data_x)                  # zero mean, unit variance per feature
linear_svc = LinearSVC(C=1e9, max_iter=1000000)    # very large C: the soft margin is effectively hard
linear_svc.fit(data_x, data_y)
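As a quick sanity check (not part of the original walkthrough), the training accuracy can be printed; with C=1e9 the margin is effectively hard, so any remaining errors come from the noise in the data:

print(linear_svc.score(data_x, data_y))            # mean accuracy on the training set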
- Decision-boundary plotting helper, used by all the visualizations below
def plot_decision_boundary(model, axis):
    # build a dense grid over axis = [x0_min, x0_max, x1_min, x1_max]
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1))
    x_new = np.c_[x0.ravel(), x1.ravel()]          # grid points as (n, 2) samples
    y_predict = model.predict(x_new)               # predicted class for every grid point
    zz = y_predict.reshape(x0.shape)
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    plt.contourf(x0, x1, zz, cmap=custom_cmap)     # filled regions show the predicted classes
- Plot, then print the learned weights and intercept
plot_decision_boundary(linear_svc, axis=[-3, 3, -3, 3])
plt.scatter(data_x[data_y==0, 0], data_x[data_y==0, 1], color='red')
plt.scatter(data_x[data_y==1, 0], data_x[data_y==1, 1], color='blue')
plt.show()
print('Weights:')
print(linear_svc.coef_)
print('Intercept:')
print(linear_svc.intercept_)
Weights:
[[ 0.25575181 -1.05403425]]
Intercept:
[-0.89134169]
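Reading this output: the decision boundary is the line w·x + b = 0, i.e. approximately 0.2558·x0 − 1.0540·x1 − 0.8913 = 0, so the classifier leans mostly on the second (vertical) feature, which is why the plotted boundary is close to horizontal.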
2. Gaussian (RBF) kernel
- Import packages
from sklearn import datasets
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
- Define an SVM pipeline with an RBF kernel
def RBFKernelSVC(gamma=1.0):
    return Pipeline([
        ('std_scaler', StandardScaler()),          # standardize before the kernel is applied
        ('svc', SVC(kernel='rbf', gamma=gamma))    # RBF kernel; gamma controls the kernel width
    ])
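For reference (the standard scikit-learn definition, not spelled out in the original post), the RBF kernel is

K(x, z) = \exp(-\gamma\,\lVert x - z \rVert^2)

so larger gamma makes the kernel narrower and lets the decision boundary wrap tightly around individual training points.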
- Fit the model and show the result
svc = RBFKernelSVC(gamma=100)                      # deliberately large gamma
svc.fit(data_x, data_y)
print("Dual coefficients:", svc.named_steps['svc'].dual_coef_)
print("Intercept b:", svc.named_steps['svc'].intercept_)
plot_decision_boundary(svc, axis=[-2.5, 2.5, -2.5, 2.5])
plt.scatter(data_x[data_y==0, 0], data_x[data_y==0, 1], color='red')
plt.scatter(data_x[data_y==1, 0], data_x[data_y==1, 1], color='blue')
plt.show()
Dual coefficients: [[-1. -1. -0.51093449 -1. -0.822291 -1.
-0.75412874 -0.43835686 -0.90471477 -1. -0.64919767 -0.46067276
-1. -1. -0.95980952 -0.95965747 -1. -1.
-0.93110072 -1. -0.77514095 -0.73844853 -0.99369334 -1.
-0.96445949 -0.80367589 -0.58692433 -0.98275503 -1. -0.4721685
-0.96204829 -1. -0.24099873 -0.71333937 -0.85722593 -1.
-0.34834249 -1. -1. -0.79792194 -0.93433917 -0.81403221
-0.2438666 -1. -1. -1. -1. -1.
-0.5435427 -0.6182202 0.91203669 0.66533876 0.58438701 0.81456451
0.96723538 0.78155026 0.95696874 0.56528228 0.96716402 0.962564
0.94872729 0.83212967 0.85949483 0.95911018 0.96748936 0.57884142
0.96122 0.85724467 0.91833655 0.6391854 0.96303112 0.96571031
0.50021749 0.93670053 0.96723884 1. 0.33905587 0.71414804
0.97016942 0.90878648 0.96724545 0.80554748 0.96066593 0.7674331
0.93589343 0.85926267 0.82752165 0.95204831 0.83183969 0.74708972
0.96684025 0.1406111 0.96647257 0.91517994 0.84336219 0.96002998
0.93738769 0.56541788 0.96238301 0.90584648]]
Intercept b: [0.03262409]
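With gamma=100 the boundary tends to overfit, bending around single training points. A quick comparison loop (a sketch, not in the original post) shows the boundary smoothing out as gamma shrinks:

for g in [0.1, 1, 10, 100]:
    svc_g = RBFKernelSVC(gamma=g)                  # reuse the pipeline factory defined above
    svc_g.fit(data_x, data_y)
    plot_decision_boundary(svc_g, axis=[-2.5, 2.5, -2.5, 2.5])
    plt.scatter(data_x[data_y==0, 0], data_x[data_y==0, 1], color='red')
    plt.scatter(data_x[data_y==1, 0], data_x[data_y==1, 1], color='blue')
    plt.title('gamma = %s' % g)
    plt.show()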
3. Polynomial kernel
- Import packages
from sklearn import datasets
from sklearn.svm import LinearSVC, SVC             # SVC is needed for the kernelized version below
from sklearn.pipeline import Pipeline
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
- Build the moons dataset and visualize it
X, y = datasets.make_moons()
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
- Regenerate the data with noise and visualize it
X, y = datasets.make_moons(noise=0.15,random_state=777)
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
- Define the polynomial-feature SVM pipeline
def PolynomialSVC(degree, C=1.0):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),  # explicit polynomial feature expansion
        ("std_scaler", StandardScaler()),
        ("linearSVC", LinearSVC(C=C))                 # linear SVM on the expanded features
    ])
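To make the feature expansion concrete (an illustrative example, not from the original post), PolynomialFeatures with degree=2 maps a 2-D point (a, b) to (1, a, b, a², ab, b²):

import numpy as np
from sklearn.preprocessing import PolynomialFeatures

point = np.array([[2.0, 3.0]])
print(PolynomialFeatures(degree=2).fit_transform(point))
# [[1. 2. 3. 4. 6. 9.]]  i.e. 1, a, b, a^2, a*b, b^2

LinearSVC then learns one weight per expanded feature, which is why the degree-5 model below prints 21 weights.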
- Fit PolynomialSVC and visualize the classification
poly_svc = PolynomialSVC(degree=5)
poly_svc.fit(X, y)
print("Weights w:", poly_svc.named_steps['linearSVC'].coef_[0])
print("Intercept b:", poly_svc.named_steps['linearSVC'].intercept_[0])
plot_decision_boundary(poly_svc,axis=[-1.5,2.5,-1.0,1.5])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
Weights w: [ 0. -0.02742073 -0.23525989 -2.25346789 -0.22587913 -0.28117146
0.80263765 -1.09790627 0.5707253 -0.51768427 0.80933128 -0.02821751
-0.00701927 0.16110953 -0.38610344 1.32084844 0.15581453 0.72184688
-0.17536827 0.14240683 -0.25174644]
Intercept b: -0.1627538909845031
- The same model via the kernel trick
def PolynomialKernelSVC(degree, C=1.0):
    return Pipeline([
        ("std_scaler", StandardScaler()),
        ("kernelSVC", SVC(kernel="poly", degree=degree, C=C))  # pass degree and C through to SVC
    ])
poly_kernel_svc = PolynomialKernelSVC(degree=5)
poly_kernel_svc.fit(X, y)
print("Dual coefficients:", poly_kernel_svc.named_steps['kernelSVC'].dual_coef_)
print("Intercept b:", poly_kernel_svc.named_steps['kernelSVC'].intercept_)
plot_decision_boundary(poly_kernel_svc,axis=[-1.5,2.5,-1.0,1.5])
plt.scatter(X[y==0,0],X[y==0,1])
plt.scatter(X[y==1,0],X[y==1,1])
plt.show()
Dual coefficients: [[-1. -1. -1. -1. -1. -0.85433315
-1. -1. -1. -1. -1. -1.
-1. -1. -1. -1. -0.24006638 1.
1. 1. 1. 1. 1. 1.
1. 1. 1. 1. 1. 1.
1. 1. 0.09439953 1. ]]
Intercept b: [-0.21562245]
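For reference (scikit-learn's standard definition), SVC's polynomial kernel computes

K(x, z) = (\gamma\, x^\top z + r)^{d}

which equals a dot product in the expanded polynomial feature space without ever materializing those features; that is the kernel trick, and it is why this variant scales better than PolynomialSVC at high degree.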
2. Iris dataset
1. Preparation
- Import the required libraries
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.svm import LinearSVC, SVC             # used by the pipeline factories below
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from matplotlib.colors import ListedColormap
- Decision-boundary plotting helper
def plot_decision_boundary(model, axis):
    x0, x1 = np.meshgrid(
        np.linspace(axis[0], axis[1], int((axis[1]-axis[0])*100)).reshape(-1, 1),
        np.linspace(axis[2], axis[3], int((axis[3]-axis[2])*100)).reshape(-1, 1)
    )
    X_new = np.c_[x0.ravel(), x1.ravel()]
    y_predict = model.predict(X_new)
    zz = y_predict.reshape(x0.shape)
    custom_cmap = ListedColormap(['#EF9A9A', '#FFF59D', '#90CAF9'])
    plt.contourf(x0, x1, zz, cmap=custom_cmap)
- Define the SVM pipeline factories
def PolynomialSVC(degree, C=1.0):
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("linearSVC", LinearSVC(C=C))
    ])

def PolynomialKernelSVC(degree, C=1.0):
    return Pipeline([
        ("std_scaler", StandardScaler()),
        ("kernelSVC", SVC(kernel="poly", degree=degree, C=C))  # forward degree and C to SVC
    ])

def RBFKernelSVC(gamma=1.0):
    return Pipeline([
        ('std_scaler', StandardScaler()),
        ('svc', SVC(kernel='rbf', gamma=gamma))
    ])
- Plot the base iris data
iris = datasets.load_iris()
x_iris = iris.data
y_iris = iris.target
x_iris = x_iris[y_iris<2, :2]   # keep only classes 0 and 1, and the first two features,
y_iris = y_iris[y_iris<2]       # so the task is binary and can be drawn in 2-D
plt.scatter(x_iris[y_iris==0, 0], x_iris[y_iris==0, 1])
plt.scatter(x_iris[y_iris==1, 0], x_iris[y_iris==1, 1])
plt.show()
2. Gaussian (RBF) kernel
rbf_svc_iris = RBFKernelSVC(1)
rbf_svc_iris.fit(x_iris, y_iris)
print("Dual coefficients:", rbf_svc_iris.named_steps['svc'].dual_coef_)
print("Intercept b:", rbf_svc_iris.named_steps['svc'].intercept_)
plot_decision_boundary(rbf_svc_iris,axis=[4,7.5,1,4.5])
plt.scatter(x_iris[y_iris==0,0],x_iris[y_iris==0,1])
plt.scatter(x_iris[y_iris==1,0],x_iris[y_iris==1,1])
plt.show()
Dual coefficients: [[-0.76759127 -0.47423075 -0.2011925 -0.77968419 -0.31804531 -0.50982507
-1. -0.80508958 -0.35824411 -0.74252356 -1. 0.61031154
1. 0.79408519 0.70381878 0.74512265 0.18015706 0.07385213
0.47768741 1. 1. 0.37139158]]
Intercept b: [0.0900694]
3. Polynomial kernel
poly_svc_iris = PolynomialSVC(degree=5, C=10)
poly_svc_iris.fit(x_iris, y_iris)
print("Weights w:", poly_svc_iris.named_steps['linearSVC'].coef_[0])
print("Intercept b:", poly_svc_iris.named_steps['linearSVC'].intercept_[0])
plot_decision_boundary(poly_svc_iris,axis=[4,7.5,1,4.5])
plt.scatter(x_iris[y_iris==0,0],x_iris[y_iris==0,1])
plt.scatter(x_iris[y_iris==1,0],x_iris[y_iris==1,1])
plt.show()
E:\HP\AppData\anaconda3\envs\myVirtual1\lib\site-packages\sklearn\svm\_base.py:1199: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
warnings.warn(
Weights w: [ 0. 1.20794263 -0.34680783 0.92504893 0.24565806 -0.42660309
0.67749885 0.36385391 -0.22305894 -0.47108667 0.46589622 0.28367679
-0.11281594 -0.3892156 -0.48643102 0.28920783 0.16819633 -0.09051419
-0.3400749 -0.4493347 -0.47995201]
Intercept b: 0.6436143779310682
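The ConvergenceWarning above means liblinear hit its default iteration cap (1000) before converging on the degree-5 features. A hedged fix (a sketch; the specific max_iter value is an assumption, not from the original post) is to expose max_iter in the pipeline:

def PolynomialSVC(degree, C=1.0, max_iter=100000):   # max_iter value is a guess; tune as needed
    return Pipeline([
        ("poly", PolynomialFeatures(degree=degree)),
        ("std_scaler", StandardScaler()),
        ("linearSVC", LinearSVC(C=C, max_iter=max_iter))
    ])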