import os
import imageio
import numpy as np
def export_corr(out_path, coe: np.ndarray, feature_name):
    """
    Export a correlation matrix to a tab-separated text file.

    The first line is a header made of the literal ``correlation`` followed
    by every feature name. Each following line holds one feature name and
    then that feature's row of coefficients, all separated by tabs.

    :param out_path: path of the file to write (opened in ``'w'`` mode)
    :param coe: 2-D correlation matrix, indexed by feature position
    :param feature_name: 1-D sequence of feature names
    :return: None
    """
    # Convert once so non-str labels (e.g. numpy scalars) can be joined.
    names = [str(name) for name in feature_name]
    # The context manager closes the handle even if a write fails midway.
    with open(out_path, 'w') as f:
        f.write('\t'.join(['correlation'] + names) + '\n')
        for i, name in enumerate(names):
            f.write('\t'.join([name] + [str(v) for v in coe[i]]) + '\n')
def feature_select_first(coe: np.ndarray, coe_limit=0.5):
    """
    Select features greedily, preferring those that come first.

    Features are visited in index order; a feature is kept only if its
    absolute correlation with every already-kept feature does not exceed
    ``coe_limit``.

    :param coe: 2-D correlation matrix
    :param coe_limit: correlation threshold
    :return: list of indices of the retained features
    """
    # Size everything from the matrix itself, not from a module-level global.
    n = len(coe)
    # Subtracting the identity zeroes the self-correlation on the diagonal.
    coe_mask = np.abs(coe) - np.eye(n) > coe_limit
    indices = []
    for i in range(n):
        # Keep i only when it is not over-correlated with any kept feature.
        if not coe_mask[i, indices].any():
            indices.append(i)
    return indices
def feature_select_more(coe: np.ndarray, coe_limit=0.5, is_del_more_first=True):
    """
    Select features by repeatedly dropping the one with the most (or fewest)
    over-threshold correlations until no correlated pair remains.

    :param coe: 2-D correlation matrix
    :param coe_limit: correlation threshold
    :param is_del_more_first: which feature to drop first. True: the one
        correlated with the most other features; False: the one correlated
        with the fewest. True is recommended; False can end up deleting
        every feature in a group of similar features.
    :return: list of indices of the retained features
    """
    # Size everything from the matrix itself, not from a module-level global.
    n = len(coe)
    # Subtracting the identity zeroes the self-correlation on the diagonal.
    coe_mask = np.abs(coe) - np.eye(n) > coe_limit
    removed = set()
    # Check before removing anything: if no pair exceeds the threshold,
    # every feature is kept.
    while coe_mask.any():
        cor_num = coe_mask.sum(axis=1)
        if is_del_more_first:
            index = int(np.argmax(cor_num))
        else:
            # Features with no remaining correlations must never be chosen;
            # n is larger than any real count (at most n - 1).
            cor_num = np.where(cor_num == 0, n, cor_num)
            index = int(np.argmin(cor_num))
        removed.add(index)
        # Clear the dropped feature's row and column so it stops counting.
        coe_mask[index, :] = False
        coe_mask[:, index] = False
    return [i for i in range(n) if i not in removed]
# Directories scanned for .tif files; each file is treated as one feature.
path = [r'D:\data\0.25\a',
        r'D:\data\0.25\b',
        r'D:\data\0.25\c',
        ]

# List every directory exactly once and build names and paths in the same
# pass, so the two arrays are guaranteed to line up entry by entry.
tif_names = []
tif_paths = []
for p in path:
    for entry in os.listdir(p):
        if entry.endswith('.tif'):
            tif_names.append(entry[:-4])  # strip the '.tif' suffix
            tif_paths.append(os.path.join(p, entry))
file_name = np.array(tif_names)
file_path = np.array(tif_paths)

# Load every image and flatten each one into a single row.
img_array = np.array([imageio.imread(i) for i in file_path])
img_array = img_array.reshape(img_array.shape[0], -1)

# Compute the NaN map once: rows flag affected files, columns flag affected
# positions in the flattened images.
is_nan = np.isnan(img_array)
nan_file = is_nan.any(axis=1)
nan_mask = is_nan.any(axis=0)
if nan_file.any():
    print('Warning: NaN data in:')
    for i in np.where(nan_file)[0]:
        print(file_name[i])

# Correlate the files using only positions that are NaN-free in all of them.
coe = np.corrcoef(img_array[:, ~nan_mask])
if np.isnan(coe).any():
    raise ValueError('Error: NaN data in correlation matrix')

export_corr(r'D:\corr.csv', coe, file_name)

coe_limit = 0.5
indices = feature_select_more(coe, coe_limit=coe_limit)
print('-'*80)
print('Select: %i Features with coe_limit=%f\n' % (len(indices), coe_limit))
print('Feature:', file_name[indices])
print('-'*80)
|