核心思想: #1.随机生成指定个数(即聚类的数量)的质心点 #2.质心点不变,更新类别:计算每个点与各质心点的距离,找出每个点距离哪个质心点最近,就把该点的类别设置为那个质心点的类别 #3.类别不变,更新质心点:所有点按质心点类别分组,每个类别求出所有点的特征值的均值,作为新的质心点 #4.用新质心点重复执行第2步,循环调优,直到SSE不再变小 #5.搭建好模型后,质心点个数从1到20根据模型求出每种情况下SSE的值,画图得到最优质心点数量
from sklearn.datasets import make_blobs
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
# Build a toy 2-D dataset: 800 samples scattered around 5 blob centres.
x, y = make_blobs(n_samples=800, n_features=2, centers=5, random_state=1)
data = pd.DataFrame(x).assign(label=y)
plt.scatter(x[:, 0], x[:, 1])

# Elbow method: fit scikit-learn's KMeans for k = 1..10 and record the
# SSE (``inertia_``) of every fitted model.
SSE_total_list = []
for i in range(1, 11):
    clusters = KMeans(n_clusters=i).fit(x)
    SSE_total_list.append(clusters.inertia_)

# Draw SSE against k on a new figure; the bend in the curve suggests a
# reasonable number of clusters.
plt.figure(figsize=(5, 4), dpi=100, facecolor='lightgrey')
pd.Series(SSE_total_list, index=range(1, 11)).plot()
"""
fit后可用的重要属性
clusters.inertia_ #SSE,各样本点到其最近质心点的距离平方和
clusters.labels_ #分组结果(类别),即每个点属于哪个质心点
clusters.cluster_centers_ #质心点坐标数据
"""
from sklearn.datasets import make_blobs
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
def initial_centers(datasets, k):
    """Draw ``k`` random starting centroids inside the data's bounding box.

    Parameters
    ----------
    datasets : pandas.DataFrame
        Sample table; must contain a ``'label'`` column, which is ignored.
    k : int
        Number of centroids to generate.

    Returns
    -------
    pandas.DataFrame
        ``k`` rows (index ``0..k-1``) and one column per feature column of
        ``datasets``. Each value is sampled uniformly between that
        feature's minimum and maximum.
    """
    features = datasets.drop('label', axis=1)
    bounds = features.describe().loc[['min', 'max'], :]

    # One uniform draw of size k per feature, in column order, so a seeded
    # NumPy generator always yields the same centroids.
    samples = []
    for column in features.columns:
        low = bounds.loc['min', column]
        high = bounds.loc['max', column]
        samples.append(np.random.uniform(low, high, k))

    # Rows were built per feature; transpose so that each row is a centroid.
    return pd.DataFrame(samples, index=features.columns).T
def get_distance(datasets, centers):
    """Compute the squared Euclidean distance from every sample to every centroid.

    Parameters
    ----------
    datasets : pandas.DataFrame
        Sample table; must contain a ``'label'`` column, which is ignored.
    centers : pandas.DataFrame
        One centroid per row, with the same feature column labels as
        ``datasets`` (the subtraction aligns on column labels).

    Returns
    -------
    pandas.DataFrame
        One row per sample and one column per centroid. Columns are
        numbered ``0..n-1`` in the order of ``centers.index``, because the
        concatenated Series carry no names.
    """
    features = datasets.drop('label', axis=1)

    per_center = []
    for label in centers.index:
        offset = features - centers.loc[label, :]
        per_center.append(np.square(offset).sum(axis=1))

    return pd.concat(per_center, axis=1)
def iterate(datasets, centers):
    """Run one k-means step: assign samples to centroids, then move the centroids.

    Parameters
    ----------
    datasets : pandas.DataFrame
        Sample table; must contain a ``'label'`` column, which is ignored.
    centers : pandas.DataFrame
        Current centroids, one per row, with the feature columns of
        ``datasets``.

    Returns
    -------
    current_grup : pandas.Series
        For every sample, the position (``0..k-1``) of its nearest centroid
        in ``centers``.
    centers : pandas.DataFrame
        Updated centroids indexed ``0..k-1``: the feature-wise mean of the
        samples assigned to each centroid.
    SSE : float
        Sum over all samples of the squared distance to the nearest of the
        *input* centroids.
    """
    d_centers = get_distance(datasets, centers)
    current_grup = d_centers.idxmin(axis=1)
    SSE = d_centers.min(axis=1).sum()

    data = datasets.drop('label', axis=1)
    new_centers = data.groupby(current_grup).mean()

    # groupby only produces rows for clusters that received at least one
    # sample. Without the step below an empty cluster would silently vanish
    # and later iterations would run with fewer than k centroids.
    empty = d_centers.columns.difference(new_centers.index)
    if len(empty):
        # Distance columns are positional, so relabel the old centroids the
        # same way before copying the unchanged ones across.
        previous = centers.copy()
        previous.index = d_centers.columns
        new_centers = new_centers.reindex(d_centers.columns)
        new_centers.loc[empty, :] = previous.loc[empty, new_centers.columns].values

    return current_grup, new_centers, SSE
def Kmeans_regular(datasets, k):
    """Cluster ``datasets`` with a hand-written k-means loop.

    Starts from ``k`` random centroids and repeats the assign/update step
    until the SSE of an iteration is exactly equal to the previously
    recorded one.

    Parameters
    ----------
    datasets : pandas.DataFrame
        Sample table; must contain a ``'label'`` column, which is ignored
        by the clustering itself.
    k : int
        Number of initial centroids.

    Returns
    -------
    current_grup : pandas.Series
        Cluster assignment of every sample from the last iteration.
    SSE_list : list
        Starts with the placeholder ``0``, followed by the SSE of each
        iteration; the final repeated value is not appended.
    centers : pandas.DataFrame
        Centroids produced by the last iteration.
    """
    centers = initial_centers(datasets, k)
    SSE_list = [0]  # leading 0 is only a sentinel for the first comparison

    converged = False
    while not converged:
        current_grup, centers, SSE = iterate(datasets, centers)
        converged = SSE == SSE_list[-1]
        if not converged:
            SSE_list.append(SSE)

    return current_grup, SSE_list, centers
-------------------------------------------测试----------------------------------------
# Smoke test of the hand-written implementation: 500 samples generated
# around 8 blob centres, clustered into 4 groups.
x, y = make_blobs(n_samples=500, n_features=2, centers=8, random_state=1)
data = pd.DataFrame(x).assign(label=y)
current_grup, SSE_list, centers = Kmeans_regular(data, 4)
|