分析:先标准化再进行随机森林的填充,避免随机填充时填充的数据太大影响标准化
先分析一下特征间数据差异是否过大,过大则需要进行归一化(一般都要)
import pandas as pd
# Load the table; index_col=None keeps every CSV column as a data column.
data = pd.read_csv(r'C:\Users\44933\Desktop\数据挖掘材料\全部数据\makenomal_flag.csv', encoding='utf-8', index_col=None)
# DataFrame.median() returns a Series indexed by column name. A Series has no
# columns to rename and cannot be sliced with two-axis .iloc, so the original
# `.T` / `.columns = ...` / `.iloc[:, 0:1]` sequence raised IndexingError.
# reset_index() turns the Series into a two-column frame (feature name, median).
# numeric_only=True skips non-numeric columns instead of raising on pandas >= 2.0.
df = data.median(numeric_only=True).reset_index()
df.columns = ['zhibiao', 'lianggang']
x1 = df.iloc[:, 0:1]
y1 = df.iloc[:, 1:2]
# NOTE(review): `plt` is not imported anywhere in the visible part of this
# file; `import matplotlib.pyplot as plt` must be in scope for the two calls
# below to run -- confirm it is provided elsewhere.
# Pass 1-D columns so each feature name is paired with its median value.
plt.scatter(x1.iloc[:, 0], y1.iloc[:, 0])
plt.show()
import pandas as pd
import numpy as np
# Read the source table (default index handling, UTF-8 encoded).
x = pd.read_csv(
    r'C:\Users\44933\Desktop\数据挖掘材料\全部数据\makenomal_flag.csv',
    encoding='utf-8',
)
# The first two columns are set aside as Y and re-attached unscaled later;
# every remaining column goes into X, which is what gets normalised.
Y = x.iloc[:, :2]
X = x.iloc[:, 2:]
def MaxMinNormalization(x):
    """Rescale ``x`` to the [0, 1] range with min-max normalization.

    Every value is mapped to ``(value - min) / (max - min)``.

    For a ``pandas.DataFrame`` the minimum and maximum are taken per column,
    so each feature is scaled independently. For any other input accepted by
    ``np.min``/``np.max`` a single overall minimum and maximum are used.

    Args:
        x: Numeric data to rescale.

    Returns:
        The rescaled data. Where the range is zero (all values equal) the
        result is 0 rather than the NaN produced by dividing 0 by 0.
    """
    if isinstance(x, pd.DataFrame):
        # Be explicit about the axis: np.min/np.max on a DataFrame reduce per
        # column on pandas < 2.0 but over the whole frame on pandas >= 2.0,
        # which would silently turn per-feature scaling into global scaling.
        lowest = x.min(axis=0)
        highest = x.max(axis=0)
        value_range = highest - lowest
        # Constant columns have zero range; divide by 1 so they map to 0.
        value_range = value_range.where(value_range != 0, 1)
    else:
        lowest = np.min(x)
        highest = np.max(x)
        value_range = highest - lowest
        if value_range == 0:
            value_range = 1
    return (x - lowest) / value_range
# Put the untouched leading columns (Y) back in front of the normalised
# feature columns, side by side, then write the result without the row index.
std = pd.concat(
    objs=[Y, MaxMinNormalization(X)],
    axis='columns',
)
std.to_csv(
    r'C:\Users\44933\Desktop\数据挖掘材料\1\makestd_flag.csv',
    index=False,
)
|