一、噪音过滤
降维的目的之一是希望抛弃对模型带来负面影响的特征,同时,带有效信息的特征的方差应该是远大于噪音的,所以相比噪音,有效的特征所带来的信息不会在PCA当中大量抛弃。inverse_transform 能够在不恢复原始数据的情况下,将降维后的数据返回到原本的高维空间(即能够实现:"保留维度,但是去除方差很小的特征所带来的信息")。我们可以利用 inverse_transform 来实现噪音的过滤。
寻找确定关键特征是降噪的前提,只保留关键特征,其他都可以看做是噪音。
-
重要参数:n_components,svd_solver,random_state -
三个重要属性:components_,explained_variance_以及explained_variance_ratio_ -
接口:fit,transform,fit_transform以及inverse_transform
1、案例:手写图像识别加噪与降噪
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
# Load scikit-learn's bundled 8x8 handwritten-digit dataset.
digits = load_digits()
# Inspect (n_samples, n_features) of the flattened images.
digits.data.shape
# Distinct class labels present in the targets.
set(digits.target.tolist())
def plot_digits(data):
    """Render the first 40 rows of ``data`` as 8x8 grayscale images on a 4x10 grid."""
    fig, axes = plt.subplots(
        4, 10, figsize=(10, 4), subplot_kw={"xticks": [], "yticks": []}
    )
    for idx, axis in enumerate(axes.flat):
        axis.imshow(data[idx].reshape(8, 8), cmap="binary")
plot_digits(digits.data)
# BUG FIX: the original did `np.random.RandomState(42)` and discarded the
# result, which does NOT seed the global generator -- the noise was not
# reproducible. Draw from the seeded RandomState explicitly instead.
rng = np.random.RandomState(42)
# Gaussian noise centred on each pixel value, standard deviation 2.
noisy = rng.normal(digits.data, 2)
plot_digits(noisy)
为手写数据加上噪音之后的图片:
# n_components as a float: keep enough components to explain 50% of the
# variance of the noisy data.
pca = PCA(0.5).fit(noisy)
X_dr = pca.transform(noisy)
X_dr.shape
# Map the reduced data back to the original pixel space; the discarded
# low-variance directions (mostly noise) are not restored -- this is the
# denoising step.
without_noise = pca.inverse_transform(X_dr)
plot_digits(without_noise)
plt.show()
使用 inverse_transform 降噪后的效果如上图。接下来查看降噪后数据的维度(应与原始无噪声数据的维度一致):
# inverse_transform maps back to the original feature space, so the shape
# matches the pre-noise data.
without_noise.shape
2、案例:手写图像识别寻找最佳维度
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
# NOTE(review): the filename is spelled "recognizor" -- confirm it matches the
# CSV actually on disk before running.
data = pd.read_csv(r"digit recognizor.csv")
# First column is the label; the remaining columns are pixel features.
X = data.iloc[:,1:]
y = data.iloc[:,0]
X.shape
# Fit PCA with all components retained so we can inspect the full
# cumulative-variance curve.
pca_line = PCA().fit(X)
plt.figure(figsize=[20,5])
# The "elbow" of this curve suggests a good choice for n_components.
plt.plot(np.cumsum(pca_line.explained_variance_ratio_))
plt.xlabel("number of components after dimension reduction")
plt.ylabel("cumulative explained variance ratio")
plt.show()
# Coarse search: score a random forest on PCA-reduced data for
# n_components = 1, 11, 21, ..., 91.
score = []
for n_comp in range(1, 101, 10):
    reduced = PCA(n_comp).fit_transform(X)
    cv_mean = cross_val_score(
        RFC(n_estimators=10, random_state=0), reduced, y, cv=5
    ).mean()
    score.append(cv_mean)
plt.figure(figsize=[20, 5])
plt.plot(range(1, 101, 10), score)
plt.show()
# Fine search around the promising region found above: n_components in [10, 24].
score = []
for n_comp in range(10, 25):
    reduced = PCA(n_comp).fit_transform(X)
    score.append(
        cross_val_score(RFC(n_estimators=10, random_state=0), reduced, y, cv=5).mean()
    )
plt.figure(figsize=[20, 5])
plt.plot(range(10, 25), score)
plt.show()
# Refit with the chosen dimensionality and score a larger forest on it.
X_dr = PCA(21).fit_transform(X)
cross_val_score(RFC(n_estimators=100,random_state=0),X_dr,y,cv=5).mean()
from sklearn.neighbors import KNeighborsClassifier as KNN
# Baseline KNN score (default k) on the same reduced data.
cross_val_score(KNN(),X_dr,y,cv=5).mean()
# Search the KNN neighbourhood size k = 1..10 on PCA(23)-reduced data.
# PERF FIX: the PCA projection does not depend on the loop variable, so fit
# it once instead of recomputing the identical transform on every iteration.
score = []
X_dr = PCA(23).fit_transform(X)
for k in range(1, 11):
    once = cross_val_score(KNN(k), X_dr, y, cv=5).mean()
    score.append(once)
plt.figure(figsize=[20, 5])
plt.plot(range(10), score)
plt.show()
# Score the best k found from the curve above.
cross_val_score(KNN(4), X_dr, y, cv=5).mean()
3、模拟PCA过程
import numpy as np
import cv2 as cv
def Z_centered(dataMat):
    """Zero-centre each column of ``dataMat``.

    Returns (centred data, mean matrix) where the mean matrix is the column
    means tiled to the full shape of the input.
    """
    n_rows = dataMat.shape[0]
    col_means = np.tile(np.mean(dataMat, axis=0), (n_rows, 1))
    return dataMat - col_means, col_means
def Percentage2n(eigVals, percentage):
    """Return how many leading eigenvalues (sorted descending) are needed so
    their cumulative sum first reaches ``percentage`` of the total.

    BUG FIX: the original fell off the end of the loop and returned ``None``
    when the threshold was never reached (e.g. percentage > 1); now the full
    count is returned in that case.
    """
    desc = np.sort(eigVals)[::-1]
    running = np.cumsum(desc)
    target = running[-1] * percentage
    for num, total_so_far in enumerate(running, start=1):
        if total_so_far >= target:
            return num
    return len(desc)
def EigDV(covMat, p):
    """Eigendecompose ``covMat`` and keep the top eigenvectors retaining
    fraction ``p`` of the total eigenvalue mass.

    Returns (indices of the kept eigenvalues, matrix whose columns are the
    kept eigenvectors).
    """
    D, V = np.linalg.eig(covMat)
    k = Percentage2n(D, p)
    # BUG FIX: the old message hard-coded "99%" even though callers pass
    # other values (main() uses p=0.6); report the percentage actually used.
    print("保留" + str(round(p * 100)) + "%信息,降维后的特征个数:" + str(k) + "\n")
    # np.linalg.eig does not sort, so rank eigenvalues and take the k largest.
    eigenvalue = np.argsort(D)
    K_eigenValue = eigenvalue[-1:-(k + 1):-1]
    K_eigenVector = V[:, K_eigenValue]
    return K_eigenValue, K_eigenVector
def getlowDataMat(DataMat, K_eigenVector):
    """Project the (centred) data onto the retained eigenvectors."""
    # `*` is matrix multiplication here: the PCA() driver passes np.mat objects.
    projection = DataMat * K_eigenVector
    return projection
def Reconstruction(lowDataMat, K_eigenVector, meanVal):
    """Map the low-dimensional data back to the original space and re-add the mean."""
    back_projected = lowDataMat * K_eigenVector.T
    return back_projected + meanVal
def PCA(data, p):
    """Hand-rolled PCA round trip on a 2-D array.

    Centre the data, eigendecompose its covariance matrix, project onto the
    components retaining fraction ``p`` of eigenvalue mass, then reconstruct.
    """
    mat = np.float32(np.mat(data))
    centred, col_means = Z_centered(mat)
    cov_mat = np.cov(centred, rowvar=0)
    _, top_vectors = EigDV(cov_mat, p)
    low_dim = getlowDataMat(centred, top_vectors)
    return Reconstruction(low_dim, top_vectors, col_means)
def main():
    # Demo: run the hand-rolled PCA on a grayscale image and display the result.
    imagePath = '96014.jpg'
    image = cv.imread(imagePath)
    image = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
    rows, cols = image.shape
    # Each image column is treated as one feature before reduction.
    print("降维前的特征个数:" + str(cols) + "\n")
    print(image)
    print('----------------------------------------')
    # Keep 60% of the eigenvalue mass.
    reconImage = PCA(image, 0.6)
    # NOTE(review): values outside [0, 255] wrap around under uint8 --
    # consider clipping before the cast.
    reconImage = reconImage.astype(np.uint8)
    print(reconImage)
    cv.imshow('test', reconImage)
    cv.waitKey(0)
    cv.destroyAllWindows()
if __name__ == '__main__':
    main()
4、模拟SVD过程
def svd(img, topk_percent=0.1):
    """
    SVD-based low-rank reconstruction of an image (usable as data augmentation).

    Compute the singular value decomposition, keep the top-K singular values,
    and rebuild the image from the truncated factors.

    :param img: input image (BGR colour or already grayscale)
    :param topk_percent: fraction of singular values to keep
    :return: reconstructed grayscale image (uint8)
    """
    # NOTE(review): this function refers to `cv2` while this file imports
    # `cv2 as cv` -- confirm which alias is in scope where this snippet lives.
    img_src = img[...]
    if len(img.shape) == 3:
        img_src = cv2.cvtColor(img_src, cv2.COLOR_BGR2GRAY)
    h, w = img_src.shape  # also asserts the image is 2-D at this point
    data = np.asarray(img_src, np.double)
    U, s, V = np.linalg.svd(data)
    # ROBUSTNESS: keep at least one singular value so tiny images or ratios
    # don't yield K == 0 and an all-zero reconstruction.
    K = max(1, round(len(s) * topk_percent))
    S = np.diag(s)
    major_data = np.dot(U[:, :K], np.dot(S[:K, :K], V[:K, :]))
    # BUG FIX: clip before the uint8 cast -- the reconstruction can overshoot
    # [0, 255], and uint8 wraps around, producing speckle artifacts.
    rebuild_img = np.asarray(np.clip(major_data, 0, 255), np.uint8)
    cv2.imshow('1', rebuild_img)
    cv2.waitKey(0)
    return rebuild_img
def pca(img, topk_percent=0.1):
    """
    PCA-based low-rank reconstruction of an image (usable as data augmentation).

    Zero-centre the pixel columns, eigendecompose the covariance matrix,
    project onto the top-K principal components, and reconstruct.

    :param img: input image (BGR colour or already grayscale)
    :param topk_percent: fraction of principal components to keep
    :return: reconstructed grayscale image (uint8)
    """
    # NOTE(review): refers to `cv2` while this file imports `cv2 as cv` --
    # confirm which alias is in scope where this snippet lives.
    img_src = img[...]
    if len(img.shape) == 3:
        img_src = cv2.cvtColor(img_src, cv2.COLOR_BGR2GRAY)
    print(img_src.shape)
    h, w = img_src.shape
    # np.asarray with a dtype change copies, so `data -= _mean` below cannot
    # mutate the caller's image.
    data = np.asarray(img_src, np.double)
    _mean = np.mean(data, axis=0)
    data -= _mean
    data_cov = np.cov(data, rowvar=False)
    feat_values, feat_vectors = np.linalg.eig(data_cov)
    # BUG FIX: sort the eigenvalues directly. The original sorted
    # np.sqrt(feat_values), which maps tiny negative eigenvalues (numerical
    # noise from the symmetric covariance) to NaN and corrupts the ordering.
    feat_index = np.argsort(feat_values, axis=0)[::-1]
    V = feat_vectors[:, feat_index]
    # ROBUSTNESS: keep at least one component even for tiny inputs/ratios.
    K = max(1, round(len(feat_values) * topk_percent))
    major_data = np.dot(np.dot(data, V[:, :K]), V[:, :K].T) + _mean
    # BUG FIX: clip before the uint8 cast to avoid wrap-around artifacts.
    rebuild_img = np.asarray(np.clip(major_data, 0, 255), np.uint8)
    cv2.imshow('0', rebuild_img)
    cv2.waitKey(0)
    return rebuild_img
|