Semi-supervised KMeans
KMeans is unsupervised, but it also admits a supervised form, which is very simple: compute the cluster centers directly from the labels. This amounts to half an iteration of unsupervised KMeans.
What this post contributes is a semi-supervised KMeans, which makes full use of whatever labels are known. In machine-learning terms, it helps fuse human knowledge with the knowledge a machine discovers from data.
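Concretely, the supervised variant just sets each center to the mean of the labeled points of that class (using the notation introduced in the Algorithm section; the same formula reappears there as the initialization step):

$$\mu_c=\frac{1}{\sharp\{i\mid c_i=c\}}\sum_{c_i=c}x_i,\quad c\in C.$$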
Algorithm
Input: point sets $D_0=\{(x_i,c_i)\}$ (labeled) and $D_1=\{x_i'\}$ (unlabeled). Output: a classifier (or the cluster centers).
Let $C_0=\{c_i\}$ and $C_1=C\setminus C_0$. The iteration below never changes $\gamma(x_i)=c_i$ for $(x_i,c_i)\in D_0$.
1. Initialize the cluster centers $\{\mu_c,\ c\in C_1\}$ from $D_1$ (e.g., at random or with k-means++), and initialize the cluster centers $\{\mu_c,\ c\in C_0\}$ from $D_0$ as
$$\mu_c=\frac{1}{\sharp\{x\in D_0\mid\gamma(x)=c\}}\sum_{\gamma(x)=c,\ x\in D_0}x,\quad c\in C_0;$$
2. Reassign the unlabeled points: $\gamma(x)=\arg\min_{c\in C}\|x-\mu_c\|,\ x\in D_1$;
3. Update the centers: $\mu_c=\frac{1}{N_c}\sum_{\gamma(x)=c}x$, where the sum runs over the $x$ in both $D_0$ and $D_1$;
4. Repeat steps 2-3 until convergence (a toy walk-through follows after this list).
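To make steps 2-3 concrete, here is a tiny hypothetical 1-D run (toy data invented purely for illustration): both classes already appear in $D_0$, so the initial centers are just the per-class means of the labeled points.

import numpy as np

# hypothetical toy data: two labeled 1-D points (classes 0 and 1) and three unlabeled ones
X0, y0 = np.array([[0.0], [10.0]]), np.array([0, 1])
X1 = np.array([[1.0], [2.0], [9.0]])

# step 1: both classes are labeled, so the initial centers are the per-class means of X0
mu = np.array([X0[y0 == c].mean(axis=0) for c in (0, 1)])

for _ in range(10):
    # step 2: assign each unlabeled point to its nearest center (absolute distance suffices in 1-D)
    y1 = np.argmin(np.abs(X1 - mu.T), axis=1)
    # step 3: recompute every center over labeled + unlabeled points
    X, y = np.vstack((X0, X1)), np.concatenate((y0, y1))
    mu = np.array([X[y == c].mean(axis=0) for c in (0, 1)])

print(mu.ravel())  # [1.0, 9.5]: class 0 absorbs the points 1 and 2, class 1 absorbs 9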
Compared with unsupervised KMeans, the only complication here is the initialization. If $C_0$ does not cover all classes, first assign cluster centers to $C_1$, for instance by picking points from $D_1$ at random, and then take the mean of each class in $D_0$ as the cluster center of the corresponding class in $C_0$ (which degenerates into a supervised classification algorithm).
Code
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.cluster import KMeans, kmeans_plusplus
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.model_selection import train_test_split
from sklearn import datasets

digits = datasets.load_digits()
# hold out half of the digits for testing
X_train, X_test, y_train, y_test = train_test_split(digits.data, digits.target, test_size=0.5)
# keep labels for only 5% of the training set; the remaining 95% is treated as unlabeled
X_train0, X_train1, y_train0, _ = train_test_split(X_train, y_train, test_size=0.95)
class SupervisedKMeans(ClassifierMixin, KMeans):
    """Supervised KMeans: each center is the mean of the samples sharing a label."""

    def fit(self, X, y):
        self.classes = np.unique(y)
        self.centers_ = np.array([np.mean(X[y == c], axis=0) for c in self.classes])
        self.cluster_centers_ = self.centers_
        return self

    def predict(self, X):
        # assign every sample to the class of its nearest center
        ed = euclidean_distances(X, self.cluster_centers_)
        return np.array([self.classes[k] for k in np.argmin(ed, axis=1)])

    def score(self, X, y):
        return np.mean(self.predict(X) == y)
class SemiKMeans(SupervisedKMeans):
    def fit(self, X0, y0, X1):
        """Fit the semi-supervised model.

        Args:
            X0 (array): input variables with labels
            y0 (array): labels
            X1 (array): input variables without labels

        Returns:
            the model
        """
        classes0 = np.unique(y0)
        # classes that never occur in the labeled set
        classes1 = np.setdiff1d(np.arange(self.n_clusters), classes0)
        self.classes = np.concatenate((classes0, classes1))
        X = np.vstack((X0, X1))
        n1 = len(classes1)
        # centers of the labeled classes: per-class means of X0
        mu0 = SupervisedKMeans().fit(X0, y0).centers_
        if n1:
            # seed the unseen classes with k-means++ on the unlabeled data;
            # stack in the same order as self.classes (labeled classes first)
            centers, _ = kmeans_plusplus(X1, n_clusters=n1)
            self.cluster_centers_ = np.vstack((mu0, centers))
        else:
            self.cluster_centers_ = mu0
        for _ in range(30):  # fixed number of iterations; stopping at convergence also works
            # reassign each unlabeled point to its nearest center
            ED = euclidean_distances(X1, self.cluster_centers_)
            y1 = np.array([self.classes[k] for k in np.argmin(ED, axis=1)])
            y = np.concatenate((y0, y1))
            # update every center over labeled + unlabeled points
            self.cluster_centers_ = np.array([np.mean(X[y == c], axis=0) for c in self.classes])
        return self
if __name__ == '__main__':
    km = SemiKMeans(n_clusters=10)
    km.fit(X_train0, y_train0, X_train1)
    skm = SupervisedKMeans(n_clusters=10)
    skm.fit(X_train0, y_train0)
    print(f"""
# clusters: 10
# samples: {X_train0.shape[0]} + {X_train1.shape[0]}
SemiKMeans: {km.score(X_test, y_test)}
SupervisedKMeans: {skm.score(X_test, y_test)}
""")
# clusters: 10
# samples: 44 + 854
SemiKMeans: 0.7975528364849833
SupervisedKMeans: 0.7675194660734149
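For context, it may help to compare against a fully unsupervised baseline as well. The sketch below is an assumption on my part, not part of the original experiment: it reuses the split and imports from the script above, fits a plain KMeans on all training points, and maps each cluster to the majority label among the few labeled points that land in it (clusters with no labeled member are mapped to -1 and always count as errors).

# a rough unsupervised baseline (sketch): plain KMeans + majority-vote label mapping
ukm = KMeans(n_clusters=10, n_init=10).fit(X_train)
labels0 = ukm.predict(X_train0)
mapping = {}
for k in range(10):
    members = y_train0[labels0 == k]  # labeled training points that fall into cluster k
    mapping[k] = np.bincount(members).argmax() if len(members) else -1
y_pred = np.array([mapping[k] for k in ukm.predict(X_test)])
print("Unsupervised KMeans:", np.mean(y_pred == y_test))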