Data Information
Industrial steam data
Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
Reading the data
test_data_file = "./zhengqi_test.txt"
train_data_file = "./zhengqi_train.txt"
train_data = pd.read_csv(train_data_file, sep='\t', encoding='utf-8')
test_data = pd.read_csv(test_data_file, sep='\t', encoding='utf-8')
train_data.info()
train_data.describe()
Box plots
def box_map(Data):
    """Draw a box plot for every feature column of Data."""
    plt.figure(figsize=(18, 10))
    plt.boxplot(Data.values, labels=Data.columns)
    plt.hlines([-7.5, 7.5], 0, 40, "red")
    """
    Alternative way to draw the box plots:
    # Grab each feature's name for labeling
    column = Data.columns.tolist()[:39]
    # Set the figure width and height
    fig = plt.figure(figsize=(20, 40))
    for i in range(38):
        # one canvas, 13 rows x 3 columns
        plt.subplot(13, 3, i + 1)
        sns.boxplot(Data[column[i]]  # data
                    , orient="v"     # "v" | "h": vertical or horizontal display
                    , width=0.5)     # box width
        # add the label name
        plt.ylabel(column[i], fontsize=8)
    """
box_map(train_data)
Finding and removing outliers
from sklearn.metrics import mean_squared_error, r2_score

def find_outliers(model, train_data, sigma=3):
    """Predict each feature from the remaining features with model;
    rows whose residual z-score exceeds sigma are flagged as outliers
    and dropped (only when the auxiliary model fits well)."""
    ALLX = train_data.iloc[:, 0:-1]
    ALLY = train_data.iloc[:, -1]
    plt.figure(figsize=(15, 3 * 38))
    for i, eachName in enumerate(ALLX.columns):
        print("feature " + str(eachName) + ":")
        _y = ALLX.loc[:, eachName]
        _X = ALLX.drop(eachName, axis=1)
        model.fit(_X, _y)
        y_pred = pd.Series(model.predict(_X), index=_y.index)
        R2 = r2_score(_y, y_pred)  # r2_score expects (y_true, y_pred)
        print("MSE: {}, R^2: {}".format(round(mean_squared_error(_y, y_pred), 4), round(R2, 4)))
        resid = _y - y_pred
        resid_mean = resid.mean()
        resid_std = resid.std()
        print("resid_mean: {}, resid_std: {}".format(round(resid_mean, 4), round(resid_std, 4)))
        z = (resid - resid_mean) / resid_std
        outliers = z[abs(z) > sigma].index
        print("outlier index:", outliers.tolist())
        ax_1 = plt.subplot(38, 3, i * 3 + 1)
        plt.plot(_y, y_pred, ".", label="Accepted")
        plt.plot(_y.loc[outliers], y_pred[outliers], "ro", label="Outlier")
        plt.ylabel("y_pred")
        plt.xlabel("true_y of " + eachName)
        plt.legend()
        ax_2 = plt.subplot(38, 3, i * 3 + 2)
        plt.plot(_y, _y - y_pred, ".", label="Accepted")
        plt.plot(_y.loc[outliers], _y.loc[outliers] - y_pred.loc[outliers], "ro", label="Outlier")
        plt.ylabel("residual")
        plt.xlabel("true_y of " + eachName)
        plt.legend()
        ax_3 = plt.subplot(38, 3, i * 3 + 3)
        ax_3.hist(z, bins=50, facecolor="blue")
        ax_3.hist(z.loc[outliers], bins=50, facecolor="red")
        plt.legend(["Accepted", "Outlier"])
        plt.xlabel("distribution of " + eachName)
        # Only drop outliers when the auxiliary model explains the feature well
        if R2 > 0.7:
            ALLX = ALLX.drop(outliers)
            ALLY = ALLY.drop(outliers)
    plt.tight_layout()
    data = pd.concat([ALLX, ALLY], axis=1)
    data.index = list(range(data.shape[0]))
    return data
from sklearn.linear_model import Ridge
train_data = find_outliers(Ridge(), train_data, sigma=3)
On the normal curve, the retained data is the region within ±sigma standard deviations; sigma plays the role of z in the plots above. Visualizing the deleted samples and redrawing the box plots shows that quite a few outliers still remain after removal.
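Under the normality assumption on the residuals, a cut at |z| <= 3 keeps about 99.73% of the samples. A minimal sketch of that number with scipy (illustrative only):
# Fraction of samples kept by a |z| <= sigma cut under a normal assumption
retained = stats.norm.cdf(3) - stats.norm.cdf(-3)
print("retained fraction at sigma=3: {:.4f}".format(retained))  # ~0.9973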
Does the data follow a normal distribution?
def prob_kde(train_data):
    """Plot each feature's distribution (with a fitted normal) and its Q-Q plot."""
    train_cols = 6
    train_rows = len(train_data.columns)
    plt.figure(figsize=(4 * train_cols, 4 * train_rows / 3))
    i = 0
    for col in train_data.columns[:-1]:
        dat = train_data[[col, "target"]].dropna()
        i += 1
        ax = plt.subplot(int(np.ceil(train_rows / 3)), train_cols, i)
        sns.distplot(dat[col], fit=stats.norm)
        plt.title("skew=" + "{:.4f}".format(stats.skew(dat[col])))
        """ skewness = 0 : normally distributed.
            skewness > 0 : more weight in the right tail of the distribution.
            skewness < 0 : more weight in the left tail of the distribution. """
        i += 1
        ax = plt.subplot(int(np.ceil(train_rows / 3)), train_cols, i)
        res = stats.probplot(dat[col], plot=plt)
        plt.title("corr=" + "{:.2f}".format(np.corrcoef(dat[col], dat["target"])[0][1]))
    plt.tight_layout()
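As a quick sanity check of the skewness convention quoted in the docstring (an illustrative snippet, not part of the original analysis), a lognormal sample has a long right tail and therefore positive skew:
# Sign convention of stats.skew on a known right-skewed sample
rng = np.random.default_rng(0)
sample = rng.lognormal(size=10000)  # long right tail
print(stats.skew(sample))           # positive (right-skewed)
print(stats.skew(-sample))          # negative (left-skewed)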
Distribution of training vs. test data
def train_test_kde(train_data, test_data, columns):
    dist_cols = 6
    dist_rows = len(test_data.columns)  # 38 features / 6 per row -> 7 rows
    plt.figure(figsize=(4 * dist_cols, 4 * 7))
    i = 1
    for col in columns:
        ax = plt.subplot(7, dist_cols, i)
        sns.kdeplot(train_data[col], color="red", shade=True, label="train")
        sns.kdeplot(test_data[col], color="blue", shade=True, label="test")
        plt.xlabel(col)
        plt.ylabel("Frequency")
        plt.legend()
        i += 1
    plt.tight_layout()
train_test_kde(train_data, test_data, test_data.columns)
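The columns dropped below were picked by eye from the KDE plots. A sketch of a quantitative alternative (not what was done here) is a two-sample Kolmogorov-Smirnov test per feature; the 0.01 threshold is an arbitrary illustrative choice:
from scipy.stats import ks_2samp
# Flag features whose train/test distributions differ significantly
for col in test_data.columns:
    stat, p = ks_2samp(train_data[col], test_data[col])
    if p < 0.01:
        print("{}: KS={:.3f}, p={:.2e}".format(col, stat, p))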
# Features whose train/test distributions differ markedly
drop_columns = ["V5", "V9", "V11", "V17", "V22", "V28"]
train_data.drop(columns = drop_columns, inplace=True)
test_data.drop(columns = drop_columns, inplace=True)
Normalization and Gaussianization of the training and test data
Distribution of the data:
prob_kde(train_data)
from sklearn.preprocessing import MinMaxScaler

def func_mms(train, test):
    cols_numeric = test.columns
    # Note: a separate MinMaxScaler is fitted on train and on test here
    train_data_process = pd.DataFrame(MinMaxScaler().fit_transform(train[cols_numeric]), columns=cols_numeric)
    test_data_process = pd.DataFrame(MinMaxScaler().fit_transform(test[cols_numeric]), columns=cols_numeric)
    return pd.concat([train_data_process, train["target"]], axis=1), test_data_process
train_data, test_data = func_mms(train_data, test_data)
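Note that func_mms fits one scaler on the training set and a separate one on the test set, so the two sets are scaled by different min/max values. A common alternative, sketched here for comparison (func_mms_fit_on_train is a hypothetical name, not part of the original), is to fit on train only and reuse the transform:
def func_mms_fit_on_train(train, test):
    # Hypothetical variant: one scaler fitted on train, reused for test
    cols = test.columns
    scaler = MinMaxScaler().fit(train[cols])
    train_p = pd.DataFrame(scaler.transform(train[cols]), columns=cols)
    test_p = pd.DataFrame(scaler.transform(test[cols]), columns=cols)
    return pd.concat([train_p, train["target"]], axis=1), test_p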
Gaussianization with stats.boxcox:
for var in test_data.columns:
    # +1 shifts the min-max-scaled values into [1, 2]; boxcox needs positive input
    train_data[var], lambda_var = stats.boxcox(train_data[var].dropna() + 1)
    test_data[var], lambda_var = stats.boxcox(test_data[var].dropna() + 1)
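To confirm numerically that the transform reduced the skew, one can spot-check the per-feature skewness (illustrative; first three columns only):
# Skewness after Box-Cox should be closer to 0 than before
for var in test_data.columns[:3]:
    print(var, round(stats.skew(train_data[var]), 4))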
prob_kde(train_data)
Multicollinearity
Inspecting it with a heatmap
train_corr = train_data.corr()
plt.figure(figsize=(20, 16))
sns.heatmap(train_corr, vmax=0.8, square=True, annot=True)
The lighter the cell, the stronger the collinearity. Next, the ten features most correlated with the target:
nlargest_f = train_corr.nlargest(10, columns="target")["target"]
cols = nlargest_f.index
plt.figure(figsize=(10, 10))
sns.heatmap(train_data[cols].corr(), annot=True, square=True)
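A sketch of turning the correlation view into a feature filter (the 0.1 cutoff is an arbitrary illustrative value, not something applied in the original):
# Keep features whose absolute correlation with the target exceeds a cutoff
threshold = 0.1
corr_with_target = train_corr["target"].drop("target")
selected = corr_with_target[corr_with_target.abs() > threshold].index
print(selected.tolist())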
Inspecting it with the variance inflation factor (VIF)
from statsmodels.stats.outliers_influence import variance_inflation_factor
cols = train_data.columns[:-1]  # features only, excluding the target
X = np.asarray(train_data[cols])
VIF_list = [variance_inflation_factor(X, i) for i in range(X.shape[1])]
See the linked reference for an interpretation of the variance inflation factor. PCA can also address multicollinearity, but it performed poorly here and was not used.
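For reference, a minimal sketch of the PCA approach mentioned above (n_components=0.9 keeps 90% of the variance; the value is illustrative):
from sklearn.decomposition import PCA
# Project the features onto principal components that keep 90% of the variance;
# the components are orthogonal, which removes multicollinearity
pca = PCA(n_components=0.9)
X_pca = pca.fit_transform(train_data.drop(columns=["target"]))
print(X_pca.shape, round(pca.explained_variance_ratio_.sum(), 4))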