四:Python实现
算法代码
cluster:该列表用于标识某个数据点属于哪一个簇。例如 cluster[3] = 2、cluster[252] = 1、cluster[26] = 2,表示 3、252、26 号数据点分别属于簇 2、1、2;很显然 3 号和 26 号数据点属于同一个簇。centroids:该列表用于记录质心,保存的是质心数据,便于在可视化时标记。
import numpy as np
'''
K-means (Lloyd's algorithm):
1: randomly initialise k centroids
2: compute each sample's distance to the k centroids and assign it to the
   cluster of the nearest centroid
3: for each cluster, recompute the centroid as the mean of its members
4: repeat steps 2 and 3 until the centroids no longer change
'''
def centorids_init(data_set, k):
    """Randomly pick k distinct samples from data_set as initial centroids.

    Args:
        data_set: (n_samples, n_features) array of data points.
        k: number of clusters to initialise.

    Returns:
        (k, n_features) array whose rows are k distinct rows of data_set.

    Raises:
        ValueError: if k exceeds the number of samples (the original code
            silently returned fewer than k centroids in that case).
    """
    examples_nums = np.shape(data_set)[0]
    if k > examples_nums:
        raise ValueError(
            f"k ({k}) cannot exceed the number of samples ({examples_nums})")
    # A random permutation guarantees the k picked rows are distinct.
    random_ids = np.random.permutation(examples_nums)
    centorids = data_set[random_ids[:k], :]
    return centorids
def compute_cluster(data_set, centorids):
    """Assign every sample to the nearest centroid (Euclidean distance).

    Args:
        data_set: (n_samples, n_features) array of data points.
        centorids: (k, n_features) array of current centroids.

    Returns:
        (n_samples, 1) array of cluster indices in [0, k).
    """
    examples_nums = np.shape(data_set)[0]
    # Broadcast (n, 1, f) - (1, k, f) -> (n, k, f), then reduce over the
    # feature axis to get the (n, k) matrix of squared distances. This
    # replaces the original O(n*k) pure-Python double loop.
    diff = data_set[:, np.newaxis, :] - centorids[np.newaxis, :, :]
    sq_dist = np.sum(diff ** 2, axis=2)
    # sqrt is monotonic, so argmin over squared distances picks the same
    # centroid (and breaks ties identically: first minimum wins).
    cluster = np.argmin(sq_dist, axis=1).reshape(examples_nums, 1)
    return cluster
def renew_centoids(data_set, cluster, k):
    """Recompute each centroid as the mean of the samples assigned to it.

    Args:
        data_set: (n_samples, n_features) array of data points.
        cluster: (n_samples, 1) array of cluster indices.
        k: total number of clusters.

    Returns:
        (k, n_features) array of updated centroids.
    """
    features_num = data_set.shape[1]
    centorids = np.zeros((k, features_num))
    for centroid_id in range(k):
        closest_ids = (cluster == centroid_id).flatten()
        if np.any(closest_ids):
            centorids[centroid_id] = np.mean(data_set[closest_ids, :], axis=0)
        else:
            # Bug fix: np.mean over an empty selection returns NaN (with a
            # RuntimeWarning) and poisons every later iteration. Re-seed an
            # empty cluster with a randomly chosen data point instead.
            centorids[centroid_id] = data_set[
                np.random.randint(data_set.shape[0])]
    return centorids
def k_means(data_set, k, max_iterations):
    """Run Lloyd's k-means on data_set.

    Args:
        data_set: (n_samples, n_features) array of data points.
        k: number of clusters.
        max_iterations: upper bound on assignment/update rounds.

    Returns:
        Tuple (centorids, cluster): the final (k, n_features) centroids and
        the (n_samples, 1) assignment of each sample.
    """
    examples_nums = np.shape(data_set)[0]
    centorids = centorids_init(data_set, k)
    cluster = np.zeros(examples_nums)
    for _ in range(max_iterations):
        cluster = compute_cluster(data_set, centorids)
        new_centorids = renew_centoids(data_set, cluster, k)
        # Bug fix: the module comment promises "repeat until the centroids
        # no longer change", but the original always ran max_iterations.
        # Stop as soon as the update is a fixed point; at that moment the
        # returned cluster is consistent with the returned centroids.
        if np.array_equal(new_centorids, centorids):
            break
        centorids = new_centorids
    return centorids, cluster
五:效果展示
(1)Iris数据集(较好)
# Demo: k-means on the Iris data set (petal length/width), with the labelled
# raw data on the left and the learned clusters on the right.
import pandas as pd
import matplotlib.pyplot as plt
import KMeans2
import numpy as np

Iris_types = ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
Iris_data = pd.read_csv('./Iris.csv')
x_axis = 'PetalLengthCm'
y_axis = 'PetalWidthCm'
examples_num = Iris_data.shape[0]
train_data = Iris_data[[x_axis, y_axis]].values.reshape(examples_num, 2)

# Min-max normalise each feature into [0, 1] so both axes weigh equally.
min_vals = train_data.min(0)
max_vals = train_data.max(0)
ranges = max_vals - min_vals
nums = train_data.shape[0]
normal_data = (train_data - np.tile(min_vals, (nums, 1))) \
    / np.tile(ranges, (nums, 1))

k = 3
max_iterations = 50
centroids, cluster = KMeans2.k_means(normal_data, k, max_iterations)

plt.figure(figsize=(12, 5), dpi=80)
plt.subplot(1, 2, 1)
# Bug fix: the original loop drew the full data set once per species, all in
# black, so the per-species colouring never happened. Filter by species so
# each true class gets its own colour.
# NOTE(review): assumes the label column is named 'Species' — confirm
# against the header of Iris.csv.
for Iris_type in Iris_types:
    subset = Iris_data[Iris_data['Species'] == Iris_type]
    plt.scatter(subset[x_axis], subset[y_axis], label=Iris_type)
plt.legend()
plt.title('raw')

plt.subplot(1, 2, 2)
# Points coloured by learned cluster, centroids marked with red crosses.
for centroid_id, centroid in enumerate(centroids):
    current_examples_index = (cluster == centroid_id).flatten()
    plt.scatter(normal_data[current_examples_index, 0],
                normal_data[current_examples_index, 1])
for centroid_id, centroid in enumerate(centroids):
    plt.scatter(centroid[0], centroid[1], c='red', marker='x')
plt.title('label kmeans')  # typo fix: was 'label kemans'
plt.show()
(2)人造数据集(好)
# Demo: k-means on the synthetic data set 438-3.csv; raw points on the left,
# coloured clusters with their centroids on the right.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import KMeans2

raw_data = pd.read_csv('./438-3.csv', header=None)
sample_count = raw_data.shape[0]
train_data = raw_data[[0, 1]].values.reshape(sample_count, 2)

# Min-max scale both features into [0, 1].
feature_min = train_data.min(0)
feature_max = train_data.max(0)
feature_range = feature_max - feature_min
normal_data = (train_data - np.tile(feature_min, (sample_count, 1))) \
    / np.tile(feature_range, (sample_count, 1))
print(normal_data)

k = 3
max_iterations = 50
centroids, cluster = KMeans2.k_means(normal_data, k, max_iterations)

plt.figure(figsize=(12, 5), dpi=80)

# Left panel: the normalised points, uncoloured.
plt.subplot(1, 2, 1)
plt.scatter(normal_data[:, 0], normal_data[:, 1], c='black')
plt.title('raw')

# Right panel: one colour per cluster, centroids as red crosses.
plt.subplot(1, 2, 2)
for label, _ in enumerate(centroids):
    members = (cluster == label).flatten()
    plt.scatter(normal_data[members, 0], normal_data[members, 1])
for _, centroid in enumerate(centroids):
    plt.scatter(centroid[0], centroid[1], c='red', marker='x')
plt.title('k-Means')
plt.show()
(3)Jain数据集(较差)
(4)melon数据集(好)
(5)Spril数据集(差)
(6)threeCircles数据集(差)
(7)Square数据集(好)
(8)lineblobs数据集(差)
(9)788points数据集(差)
- 代码同(2)
(10)gassian数据集(好)
(11)arrevation数据集(较差)