原理参考:《K-means聚类算法原理及Python实现》、《Sklearn之KMeans算法》
python实现:
import random
import pandas as pd
import numpy as np
class KMeans:
    """Minimal k-means clustering (Lloyd's algorithm) on a 2-D data set.

    Args:
        dataSet: Array-like of shape (n_samples, n_features).
        k: Number of clusters to form.
        max_iter: Upper bound on the number of assignment/update passes run
            by ``predict``; a safety net against an endless loop.
    """

    def __init__(self, dataSet, k, max_iter=300):
        # Accept plain nested lists too; the dtype is left untouched so the
        # points handed back by ``predict`` look exactly like the input rows.
        self.dataSet = np.asarray(dataSet)
        self.k = k
        self.max_iter = max_iter

    def calcDis(self, centroids):
        """Return the Euclidean distance from every sample to every centroid.

        Args:
            centroids: Array-like of shape (k, n_features).

        Returns:
            ndarray of shape (n_samples, k); entry ``[i, j]`` is the distance
            between sample ``i`` and centroid ``j``.
        """
        centroids = np.asarray(centroids, dtype=float)
        # Broadcasting (n, 1, d) - (1, k, d) yields all pairwise differences
        # at once, replacing the per-sample np.tile loop.
        diff = self.dataSet[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        return np.sqrt(np.sum(diff ** 2, axis=2))

    def classify(self, centroids):
        """Assign samples to their nearest centroid and recompute the means.

        Args:
            centroids: Array-like of shape (k, n_features).

        Returns:
            Tuple ``(changed, newCentroids)``: ``newCentroids`` is the
            (k, n_features) array of updated centroids and ``changed`` is
            ``newCentroids - centroids``.
        """
        centroids = np.asarray(centroids, dtype=float)
        labels = np.argmin(self.calcDis(centroids), axis=1)
        # Start from the old centroids so that a cluster which received no
        # samples keeps its position instead of vanishing from the result
        # (which used to break the shape of ``changed``).
        newCentroids = centroids.copy()
        for index in range(self.k):
            members = self.dataSet[labels == index]
            if len(members):
                newCentroids[index] = members.mean(axis=0)
        changed = newCentroids - centroids
        return changed, newCentroids

    def predict(self):
        """Run k-means from a random initialisation until the centroids settle.

        Returns:
            Tuple ``(centroids, cluster)``: ``centroids`` is a list of k
            coordinate lists and ``cluster[j]`` is the list of samples
            assigned to centroid ``j``.

        Raises:
            ValueError: If ``k`` is not between 1 and the number of samples.
        """
        n_samples = self.dataSet.shape[0]
        if not 0 < self.k <= n_samples:
            raise ValueError(
                'k must be between 1 and the number of samples (%d), got %r'
                % (n_samples, self.k)
            )

        # Seed the centroids with k distinct samples.
        seeds = np.random.choice(n_samples, size=self.k, replace=False)
        changed, newCentroids = self.classify(self.dataSet[seeds, :])
        passes = 1
        while np.any(changed != 0) and passes < self.max_iter:
            changed, newCentroids = self.classify(newCentroids)
            passes += 1

        labels = np.argmin(self.calcDis(newCentroids), axis=1)
        cluster = [[] for _ in range(self.k)]
        for sample, label in zip(self.dataSet, labels):
            cluster[label].append(sample)
        return newCentroids.tolist(), cluster
if __name__ == '__main__':
    # Demo: six 2-D points that form two well-separated groups of three.
    x = np.array([
        [1, 1], [1, 2], [2, 1],
        [6, 4], [6, 3], [5, 4],
    ])
    kmeans = KMeans(x, 2)
    centroids, cluster = kmeans.predict()
    # One-element tuples make it explicit that each value is a single argument.
    print('质心为:%s' % (centroids,))
    print('集群为:%s' % (cluster,))
Python调包(直接调用scikit-learn):
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn import metrics
import matplotlib.pyplot as plt
# Same six sample points as the hand-written implementation above.
x = np.array([
    [1, 1], [1, 2], [2, 1],
    [6, 4], [6, 3], [5, 4],
])

# Fit a two-cluster model, then label the training points with it.
k_means = KMeans(n_clusters=2)
k_means.fit(x)
y_predict = k_means.predict(x)

# Scatter plot of the points, coloured by predicted cluster label.
plt.scatter(x[:, 0], x[:, 1], c=y_predict)
plt.show()

# Report labels, cluster centres, inertia and the silhouette score.
print(k_means.predict(x[:, :]))
print(k_means.cluster_centers_)
print(k_means.inertia_)
print(metrics.silhouette_score(x, y_predict))
C++实现:
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <utility>
#include <vector>
#include <time.h>
// Prints a matrix to std::cout: one row per line, every element followed by
// a single space, and a blank line after the last row.
// Each row is walked over its own length, so ragged matrices are printed
// safely instead of being indexed with the width of the first row.
void printMat(const std::vector<std::vector<float>>& mat)
{
	for (const std::vector<float>& row : mat)
	{
		for (float value : row)
		{
			std::cout << value << " ";
		}
		std::cout << std::endl;
	}
	std::cout << std::endl;
}
// Returns true when every element of the matrix compares equal to zero
// (an empty matrix counts as all-zero), false as soon as one does not.
// Rows are scanned over their own length, so ragged input is safe.
bool checkZeros(const std::vector<std::vector<float>>& mat)
{
	for (const std::vector<float>& row : mat)
	{
		for (float value : row)
		{
			// Exact comparison is intentional: the caller uses this to detect
			// that the centroids did not move at all between two passes.
			if (value != 0) return false;
		}
	}
	return true;
}
// For every row of a distance matrix, returns the column index of the
// smallest value (the first one on ties). An empty row yields index 0.
std::vector<int> getminDistIndices(const std::vector<std::vector<float>>& clalist)
{
	std::vector<int> minDistIndices;
	minDistIndices.reserve(clalist.size());
	for (const std::vector<float>& row : clalist)
	{
		// Start from +infinity rather than INT_MAX: a float cannot hold
		// INT_MAX exactly, and any distance at or above it could never win.
		float minDist = std::numeric_limits<float>::infinity();
		int minDistIndex = 0;
		for (size_t j = 0; j < row.size(); j++)
		{
			if (row[j] < minDist)
			{
				minDist = row[j];
				minDistIndex = static_cast<int>(j);
			}
		}
		minDistIndices.push_back(minDistIndex);
	}
	return minDistIndices;
}
// Minimal k-means clustering (Lloyd's algorithm) over a row-major data set
// in which every row is one sample.
class KMeans
{
public:
	// dataSet: one sample per row; k: number of clusters to form.
	KMeans(std::vector<std::vector<float>> dataSet, int k) :m_dataSet(std::move(dataSet)), m_k(k) {}

	// Returns the Euclidean distance from every sample to every centroid:
	// result[i][j] is the distance between sample i and centroid j.
	// `centroids` must hold m_k rows with the same width as the samples.
	std::vector<std::vector<float>> calcDis(const std::vector<std::vector<float>>& centroids)
	{
		const size_t k = static_cast<size_t>(m_k);
		std::vector<std::vector<float>> clalist;
		clalist.reserve(m_dataSet.size());
		for (const std::vector<float>& data : m_dataSet)
		{
			std::vector<float> distances(k, 0.0f);
			for (size_t i = 0; i < k; i++)
			{
				float squaredDist = 0.0f;
				for (size_t j = 0; j < data.size(); j++)
				{
					const float diff = data[j] - centroids[i][j];
					squaredDist += diff * diff;
				}
				distances[i] = std::sqrt(squaredDist);
			}
			clalist.push_back(distances);
		}
		return clalist;
	}

	// Assigns every sample to its nearest centroid and recomputes the means.
	// Outputs: newCentroids receives the updated centroids and changed
	// receives newCentroids - centroids (all zeros once converged).
	// `centroids` is taken by value on purpose, so a caller may pass the same
	// object as both `centroids` and `newCentroids`.
	void classify(std::vector<std::vector<float>> centroids, std::vector<std::vector<float>>& newCentroids, std::vector<std::vector<float>>& changed)
	{
		const size_t k = static_cast<size_t>(m_k);
		const size_t dims = m_dataSet.empty() ? 0 : m_dataSet[0].size();
		const std::vector<int> minDistIndices = getminDistIndices(calcDis(centroids));

		// Accumulate per-cluster coordinate sums and member counts in one pass.
		std::vector<std::vector<float>> sums(k, std::vector<float>(dims, 0.0f));
		std::vector<int> counts(k, 0);
		for (size_t p = 0; p < m_dataSet.size(); p++)
		{
			const size_t owner = static_cast<size_t>(minDistIndices[p]);
			for (size_t d = 0; d < dims; d++)
			{
				sums[owner][d] += m_dataSet[p][d];
			}
			++counts[owner];
		}

		// assign() rather than resize(): stale contents of the output
		// arguments must not leak into the result.
		newCentroids.assign(k, std::vector<float>(dims, 0.0f));
		changed.assign(k, std::vector<float>(dims, 0.0f));
		for (size_t c = 0; c < k; c++)
		{
			for (size_t d = 0; d < dims; d++)
			{
				// A cluster without members keeps its previous centroid.
				// Dividing by a zero count would produce NaN, which never
				// compares equal to zero and would keep predict() looping
				// forever.
				newCentroids[c][d] = counts[c] > 0 ? sums[c][d] / counts[c] : centroids[c][d];
				changed[c][d] = newCentroids[c][d] - centroids[c][d];
			}
		}
	}

	// Runs k-means from k randomly chosen distinct samples until the
	// centroids stop moving.
	// Outputs: centroids receives the final k centroids and cluster[j]
	// receives the samples assigned to centroid j. Previous contents of both
	// arguments are discarded.
	// Throws std::invalid_argument when k is not in [1, number of samples].
	void predict(std::vector<std::vector<float>>& centroids, std::vector<std::vector<std::vector<float>>>& cluster)
	{
		// Without this check the seeding loop below could never collect k
		// distinct indices (or would take a modulo by zero on an empty set).
		if (m_k <= 0 || static_cast<size_t>(m_k) > m_dataSet.size())
		{
			throw std::invalid_argument("KMeans::predict: k must be between 1 and the number of samples");
		}
		const size_t k = static_cast<size_t>(m_k);

		// Draw k distinct sample indices to serve as the initial centroids.
		srand((unsigned)time(NULL));
		std::vector<size_t> random_indices;
		while (random_indices.size() < k)
		{
			const size_t random_index = static_cast<size_t>(rand()) % m_dataSet.size();
			if (std::find(random_indices.begin(), random_indices.end(), random_index) == random_indices.end())
				random_indices.push_back(random_index);
		}
		centroids.clear();
		centroids.reserve(k);
		for (size_t index : random_indices)
		{
			centroids.push_back(m_dataSet[index]);
		}

		// Iterate assignment and update until no centroid coordinate changes.
		std::vector<std::vector<float>> newCentroids;
		std::vector<std::vector<float>> changed;
		classify(centroids, newCentroids, changed);
		while (!checkZeros(changed))
		{
			centroids = newCentroids;
			classify(centroids, newCentroids, changed);
		}
		centroids = newCentroids;

		// Group the samples by their final nearest centroid.
		const std::vector<int> minDistIndices = getminDistIndices(calcDis(centroids));
		cluster.assign(k, std::vector<std::vector<float>>());
		for (size_t i = 0; i < minDistIndices.size(); i++)
		{
			cluster[minDistIndices[i]].push_back(m_dataSet[i]);
		}
	}

private:
	std::vector<std::vector<float>> m_dataSet;  // samples, one per row
	int m_k;                                    // requested number of clusters
};
// Demo entry point: clusters six 2-D points into two groups, then prints the
// centroids followed by the members of each cluster.
int main(int argc, char* argv[])
{
	const int k = 2;
	std::vector<std::vector<float>> dataSet = {
		{1, 1}, {1, 2}, {2, 1},
		{6, 4}, {6, 3}, {5, 4}
	};

	std::vector<std::vector<float>> centroids;
	std::vector<std::vector<std::vector<float>>> cluster;
	KMeans kmeans(dataSet, k);
	kmeans.predict(centroids, cluster);

	printMat(centroids);
	for (int i = 0; i < k; i++)
	{
		printMat(cluster[i]);
	}

	// Keeps the console window open when launched from Windows.
	system("pause");
	return EXIT_SUCCESS;
}
|