import xlwt
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from collections import OrderedDict
from sklearn.cluster import KMeans
from collections import OrderedDict
from sklearn.svm import SVC
from scipy.special import expit
from copy import deepcopy
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import rbf_kernel
from KBS_NEW.PointwiseQuery.ALOR import ALOR
from sklearn.metrics import accuracy_score, mean_squared_error
from time import time
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.utils.validation import check_X_y
# Generate cross-validation partitions and k-means clusterings for each
# dataset and store them in one .xls workbook per dataset.
#
# Workbook layout (one sheet per fold, named "1", "2", ...):
#   column 0 : dataset row indices of the training split
#   column 1 : dataset row indices of the test split
#   column 2 : one initially labeled sample per class, given as a POSITION
#              inside the training split (not a dataset row index)
#   then, for every k in [nClass, 10 * nClass], two consecutive columns:
#     - the k-means cluster label of every training sample
#     - per cluster, the dataset row index of the training sample closest
#       to the cluster centroid
#
# Root folder holding one "<name>.csv" per dataset. A raw string is used
# because "\O" is not a valid escape sequence in a normal string literal.
p = Path(r"D:\OCdata")
# Full dataset catalogue (kept for reference; assign it to names_list to use it).
# names_list = ["cleveland","HDI2","toy","glass","balance-scale","car","automobile",
#               "thyroid","thyroid2","newthyroid","baseball",
#               "machine-5bin","machine-10bin","Obesity1","Obesity2",
#               "housing-5bin","housing-10bin","ARWU2020-5bin","ARWU2020-10bin",
#               "QSR2020-5bin","QSR2020-10bin","stock-5bin", "stock-10bin",
#               "bank-5bin","bank-10bin","computer-5bin","computer-10bin",
#               "abalone-5bin","abalone-10bin","SWD","winequality-red",
#               "PowerPlant-10bin","penbased","optdigits","Computer2-10bin","bank2-10bin","eucalyptus"]
names_list = ["toy"]
for name in names_list:
    path = p.joinpath(name + ".csv")
    print("########################{}".format(path))
    # The last CSV column is the class label; all other columns are features.
    data = np.array(pd.read_csv(path, header=None))
    X = np.asarray(data[:, :-1], np.float64)
    scaler = StandardScaler()
    X = scaler.fit_transform(X)
    y = data[:, -1]
    y -= y.min()  # shift labels so that the smallest one becomes 0
    nClass = len(np.unique(y))
    workbook = xlwt.Workbook()
    Rounds = 5
    count = 0  # running fold counter, used as the sheet name
    for r in range(Rounds):
        # No random_state: every round draws a fresh shuffled 5-fold split.
        SKF = StratifiedKFold(n_splits=5, shuffle=True)
        for train_idx, test_idx in SKF.split(X, y):
            count += 1
            X_train = X[train_idx]
            y_train = y[train_idx].astype(np.int32)

            # Group training positions by class, then draw one random
            # position per class as the initially labeled set.
            labeled = []
            label_dict = OrderedDict()
            for lab in np.unique(y_train):
                label_dict[lab] = []
            for idx, lab in enumerate(y_train):
                label_dict[lab].append(idx)
            for idxlist in label_dict.values():
                for jdx in np.random.choice(idxlist, size=1, replace=False):
                    labeled.append(jdx)

            SheetNames = "{}".format(count)
            sheet = workbook.add_sheet(SheetNames)
            for i, idx in enumerate(train_idx):
                sheet.write(i, 0, int(idx))
            for i, idx in enumerate(test_idx):
                sheet.write(i, 1, int(idx))
            # NOTE(review): these are positions within the training split,
            # whereas the centre columns below hold dataset row indices --
            # confirm that downstream readers expect this difference.
            for i, idx in enumerate(labeled):
                sheet.write(i, 2, int(idx))

            column = 2
            for k in range(nClass, 10 * nClass + 1):
                kmeans = KMeans(n_clusters=k)
                kmeans.fit(X=X_train)
                # ----- cluster label of every training sample -----
                column += 1
                for j, jdx in enumerate(kmeans.labels_):
                    sheet.write(j, column, int(jdx))
                # ----- training sample nearest to each centroid -----
                center = np.zeros(k)
                for lab in range(k):
                    tmp_center = kmeans.cluster_centers_[lab]
                    # Map cluster members back to dataset row indices.
                    lab_ids = train_idx[np.where(kmeans.labels_ == lab)[0]]
                    min_dist = np.inf
                    for idx in lab_ids:
                        dist = np.linalg.norm(X[idx] - tmp_center)
                        # "<=" means that on ties the later sample wins.
                        if dist <= min_dist:
                            min_dist = dist
                            center[lab] = idx
                column += 1
                for j, jdx in enumerate(center):
                    sheet.write(j, column, int(jdx))

    # Make sure the output folder exists so the (expensive) results above are
    # not lost to a missing-directory error at save time.
    save_path = Path(r"E:\CCCCC_Result\DataPartitions")
    save_path.mkdir(parents=True, exist_ok=True)
    save_path = str(save_path.joinpath(name + ".xls"))
    workbook.save(save_path)
# ---- Read and test: reload the saved partition / k-means workbooks and plot one fold ----
import pandas as pd
import numpy as np
import xlrd
import xlwt
from pathlib import Path
from sklearn.model_selection import StratifiedKFold
from collections import OrderedDict
from sklearn.svm import SVC
from scipy.special import expit
from copy import deepcopy
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics.pairwise import pairwise_kernels
from sklearn.base import ClassifierMixin, BaseEstimator
from sklearn.utils.validation import check_X_y
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, f1_score
# Reload one dataset together with its saved partition and k-means workbooks,
# then scatter-plot the first fold: training samples coloured by cluster label,
# cluster representatives drawn as red stars.
name = "toy"
data_path = Path(r"D:\OCdata")
partition_path = Path(r"E:\CCCCC_Result\DataPartitions")
kmeans_path = Path(r"E:\CCCCC_Result\KmeansResult")

# Features are standardised; the last CSV column is the label, shifted to start at 0.
read_data_path = data_path.joinpath(name + ".csv")
data = np.array(pd.read_csv(read_data_path, header=None))
scaler = StandardScaler()
X = scaler.fit_transform(np.asarray(data[:, :-1], np.float64))
y = data[:, -1]
y -= y.min()

read_partition_path = str(partition_path.joinpath(name + ".xls"))
read_kmeans_path = str(kmeans_path.joinpath(name + ".xls"))
book_partition = xlrd.open_workbook(read_partition_path)
book_kmeans = xlrd.open_workbook(read_kmeans_path)

# Columns of the k-means sheet that are visualised.
# NOTE(review): which cluster count these two columns belong to depends on
# how the k-means workbook was written -- confirm against its producer.
LABEL_COLUMN = 16
CENTER_COLUMN = 17

for SN in book_partition.sheet_names():
    table_partition = book_partition.sheet_by_name(SN)
    table_kmeans = book_kmeans.sheet_by_name(SN)

    # Only float cells are kept; anything else in the column is skipped.
    train_idx = [
        int(cell) for cell in table_partition.col_values(0)
        if isinstance(cell, float)
    ]
    labels = [
        int(cell) for cell in table_kmeans.col_values(LABEL_COLUMN)
        if isinstance(cell, float)
    ]
    center = [
        int(cell) for cell in table_kmeans.col_values(CENTER_COLUMN)
        if isinstance(cell, float)
    ]

    for seq in (train_idx, labels, center):
        print(len(seq))
    print(center)

    plt.scatter(X[train_idx, 0], X[train_idx, 1], c=labels)
    for idx in center:
        plt.scatter(X[idx, 0], X[idx, 1], c='r', marker="*", s=120)
    plt.show()
    # Only the first sheet is inspected.
    break
|