导入包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
导入数据
# Load the raw spreadsheet. NOTE: the path is absolute and machine-specific.
data = pd.read_excel(r'F:\桌面\论文831电力预测\所有数据\电力预测数据缺失值.xlsx')

# Per-column count of missing cells; the bare expression displays the frame
# when this is run as a notebook cell.
null_counts = data.isnull().sum()
df1 = pd.DataFrame(null_counts)
df1

# Regression target, and the feature matrix made of every column except the
# first and the last one.
target = data['电力消费总量']
features = data.iloc[:, 1:-1]
X_full, y_full = features, target

# Report the number of rows and of feature columns.
n_samples, n_features = X_full.shape
print(n_samples)
print(n_features)
61
12
# Work on a copy so the original feature matrix stays untouched.
X_missing_reg = X_full.copy()

# Build a summary table: one row per feature that has at least one missing
# value, with the absolute count and the share of rows affected.
na_counts = X_missing_reg.isna().sum()
missing = pd.DataFrame({'特征': na_counts.index, '缺失值个数': na_counts.values})
missing = missing[missing['缺失值个数'] != 0]
missing['缺失比例'] = missing['缺失值个数'] / len(X_missing_reg)
missing
|   | 特征 | 缺失值个数 | 缺失比例 |
|---|---|---|---|
| 4 | 总人口 | 1 | 0.016393 |
| 5 | 第一产业增加值 | 1 | 0.016393 |
# Random-forest imputation of the feature matrix.
#
# Columns that contain missing values are processed from the fewest missing
# cells to the most. For each such column a RandomForestRegressor is trained
# on the rows where the column is known, using all other feature columns plus
# the target as predictors (their own gaps are temporarily filled with 0),
# and the predictions are written into the column's missing cells in place.
X_df = X_missing_reg.isnull().sum()
colname = X_df[~X_df.isin([0])].sort_values().index.values

# Positional indices of the incomplete columns, in processing order. The
# column list is built once, and labels are compared as-is so that non-string
# column labels are handled too.
all_columns = X_missing_reg.columns.tolist()
sortindex = [all_columns.index(name) for name in colname]

for i in sortindex:
    df = X_missing_reg
    # Column currently being imputed; it acts as the regression target.
    fillc = df.iloc[:, i]
    # Predictors: every other feature column plus the original target.
    df = pd.concat([df.drop(df.columns[i], axis=1), pd.DataFrame(y_full)], axis=1)
    # Gaps still present in the predictors are filled with 0 for this fit only.
    df_0 = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=0).fit_transform(df)

    # Boolean row mask instead of index labels: df_0 is a plain NumPy array,
    # so it must be addressed by position. Using the pandas index labels is
    # only correct when the index happens to be the default 0..n-1 range.
    known = fillc.notnull().to_numpy()
    Ytrain = fillc[known]
    Ytest = fillc[~known]
    Xtrain = df_0[known, :]
    Xtest = df_0[~known, :]

    # No random_state is set, so the imputed values vary between runs.
    rfc = RandomForestRegressor(n_estimators=100)
    rfc = rfc.fit(Xtrain, Ytrain)
    Ypredict = rfc.predict(Xtest)

    # Write the predictions into the missing cells of the working copy so
    # later (sparser) columns are trained on the already-imputed values.
    X_missing_reg.loc[X_missing_reg.iloc[:, i].isnull(), X_missing_reg.columns[i]] = Ypredict
# Sanity check after imputation: list any column that still has missing
# values (the displayed frame is empty when everything was filled).
remaining = X_missing_reg.isna().sum()
missing2 = pd.DataFrame({'列名': remaining.index, '缺失值个数': remaining.values})
missing2[missing2['缺失值个数'] != 0]

# Display the imputed feature matrix, then export it to the working directory.
X_missing_reg
X_missing_reg.to_excel('随机深林填补电力完整数据.xlsx')
|