pyclustering 开源
from pyclustering.cluster.kmeans import kmeans
from pyclustering.samples.definitions import SIMPLE_SAMPLES
from pyclustering.utils import read_sample
from pyclustering.utils.metric import distance_metric, type_metric
def my_manhattan(point1, point2):
    """Return the Manhattan (L1) distance between two points, scaled by 0.1.

    Args:
        point1: Indexable sequence of numeric coordinates; its length
            determines how many dimensions are compared.
        point2: Indexable sequence with at least as many coordinates as
            ``point1``; any extra coordinates are ignored.

    Returns:
        The sum over all dimensions of ``abs(point1[i] - point2[i]) * 0.1``
        (``0.0`` for empty input).

    Raises:
        IndexError: If ``point2`` has fewer coordinates than ``point1``.
    """
    result = 0.0
    # Index into point2 (rather than zip) so a too-short point2 still
    # raises instead of being silently truncated.
    for i, coordinate in enumerate(point1):
        result += abs(coordinate - point2[i]) * 0.1
    return result
# Wrap the user-defined function in a pyclustering distance metric object.
metric = distance_metric(type_metric.USER_DEFINED, func=my_manhattan)
# Call the metric directly on two points as a quick check of the function.
distance = metric([2.0, 3.0], [1.0, 3.0])

# Load list of points for cluster analysis.
sample = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

# Create K-Means algorithm with the specific distance metric; the metric
# object built above is reused instead of being constructed a second time.
start_centers = [[4.7, 5.9], [5.7, 6.5]]
kmeans_instance = kmeans(sample, start_centers, metric=metric)

# Run cluster analysis and obtain results.
kmeans_instance.process()
clusters = kmeans_instance.get_clusters()

# Ask the fitted model which cluster each new point is closest to.
points = [[0.25, 0.2], [2.5, 4.0]]
closest_clusters = kmeans_instance.predict(points)

print(clusters)
print(closest_clusters)
但 k-means 并不能配合任意的自定义距离使用。核心原因在于:k-means 在更新聚类中心时需要对簇内样本取平均,而在不同的距离定义之下,“平均”(即使簇内距离之和最小的中心点)的含义是不同的。例如,算术平均最小化的是平方欧氏距离之和;而在曼哈顿距离下,最优中心是各维度的中位数(对应 k-medians 算法)。
|