机器学习——房屋价格预测【回归问题】
1. 导工具包
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
2. 读取数据
# Load the Kaggle house-price dataset: train.csv carries the SalePrice
# target, test.csv is the unlabeled set we must predict.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
3. EDA探索分析
# ---- Exploratory Data Analysis ----
# Basic shape / dtype / summary statistics.
train.shape
test.shape
train.info()
train.describe()

# Missing-value counts and ratios, worst columns first.
train.isnull().sum().sort_values(ascending=False)
train.isnull().sum().sort_values(ascending=False) / train.shape[0]
test.isnull().sum().sort_values(ascending=False) / test.shape[0]

# These four columns are missing in the vast majority of rows — drop them
# from both frames rather than trying to impute them.
train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], inplace=True)
test.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence'], inplace=True)

# Split the remaining columns by dtype: numeric vs. categorical (object).
number_columns = [col for col in train.columns if train[col].dtype != 'object']
category_columns = [col for col in train.columns if train[col].dtype == 'object']

# Distribution of every numeric column.
# BUG FIX: sns.distplot was deprecated in seaborn 0.11 and removed in
# 0.14 — histplot(kde=True) is the supported equivalent.
fig, axes = plt.subplots(nrows=13, ncols=3, figsize=(20, 18))
axes = axes.flatten()
for i, col in enumerate(number_columns):
    sns.histplot(train[col], kde=True, ax=axes[i])
plt.tight_layout()

# Build year vs. sale price.
# BUG FIX: seaborn >= 0.12 no longer accepts x/y as positional data
# arguments; pass them as keywords.
plt.figure(figsize=(16, 8))
plt.title("YearBuilt vs SalePrice")
sns.scatterplot(x='YearBuilt', y='SalePrice', data=train)
plt.show()

# First-floor square footage vs. sale price.
plt.figure(figsize=(16, 8))
sns.scatterplot(x='1stFlrSF', y='SalePrice', data=train)
plt.show()

# Sale-price spread within each level of every categorical column.
fig, axes = plt.subplots(13, 3, figsize=(25, 20))
axes = axes.flatten()
for i, col in enumerate(category_columns):
    sns.stripplot(x=col, y='SalePrice', data=train, ax=axes[i])
plt.tight_layout()
plt.show()
4. Feature Engineering 特征工程
# Columns that still contain missing values, split by type.
# IDIOM: the original built these lists with four manual append loops;
# comprehensions express the same filter directly.
train_nan_num = [col for col in number_columns if train[col].isnull().sum() > 0]
train_nan_cat = [col for col in category_columns if train[col].isnull().sum() > 0]

# The test set has no SalePrice column, so remove the target from the
# numeric list before scanning test for missing values.
number_columns.remove('SalePrice')
test_nan_num = [col for col in number_columns if test[col].isnull().sum() > 0]
test_nan_cat = [col for col in category_columns if test[col].isnull().sum() > 0]
5. 针对 空缺值 的处理方式
方案一:简单粗暴:直接删除
# Missing-value option 1 (not used downstream): drop every row with any
# NaN. Kept only to show how much data this would discard.
train_one = train.dropna(axis=0)
test_one = test.dropna(axis=0)
print(train_one.shape)
print(test_one.shape)
方案二:折中法:对于数值类型列,取中位数;对于分类类型列,取None;
# Missing-value option 2 (the one actually used): impute instead of drop.
# Numeric columns get the column median; categorical columns get the
# sentinel string 'None'.
# BUG FIX: `df[col].fillna(..., inplace=True)` is chained-assignment
# inplace — deprecated in pandas 2.x and a no-op under Copy-on-Write
# (pandas 3.0). Assign the filled column back instead.
for col in train_nan_num:
    train[col] = train[col].fillna(train[col].median())
for col in train_nan_cat:
    train[col] = train[col].fillna('None')
# NOTE(review): test columns are filled with the *test* median — consider
# using the train median instead to keep train/test preprocessing
# consistent and avoid leaking test statistics.
for col in test_nan_num:
    test[col] = test[col].fillna(test[col].median())
for col in test_nan_cat:
    test[col] = test[col].fillna('None')
6. 算法建模、训练、验证
数据集分类
from sklearn.preprocessing import LabelEncoder

# Label-encode every categorical column.
# BUG FIX: the original called fit_transform separately on train and on
# test, so the same category string could map to *different* integers in
# the two frames (and unseen test categories would raise). Fit one
# encoder per column on the union of both frames, then transform each.
for col in category_columns:
    LE = LabelEncoder()
    LE.fit(pd.concat([train[col], test[col]], axis=0).astype(str))
    train[col] = LE.transform(train[col].astype(str))
    test[col] = LE.transform(test[col].astype(str))

# Feature matrix (drop the row Id and the target) and target vector.
X = train.drop(columns=['Id', 'SalePrice']).values
y = train['SalePrice'].values

from sklearn.model_selection import train_test_split
# 70/30 random split for the single-holdout experiments below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=True)
创建回归模型
方案一:线性回归+随机训练集与测试集
# Scheme 1: plain linear regression on the single random holdout split.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# RMSE (root mean squared error): sqrt of the mean squared difference
# between predictions and true values — smaller means a better fit.
# It is computed on log-prices here, matching Kaggle's evaluation metric.
LR = LinearRegression()
LR.fit(X_train, y_train)
y_pred = LR.predict(X_test)
# BUG FIX: the original passed mean_absolute_error to np.sqrt while
# labelling the result "Root Mean Squared Error". Use MSE so the printed
# number really is the RMSE.
# NOTE(review): np.log(y_pred) yields NaN for negative predictions,
# which OLS can produce — see the abs() guard used in the K-fold version.
print(f'Root Mean Squared Error : {np.sqrt(mean_squared_error(np.log(y_test), np.log(y_pred)))}')
方案二:线性回归+K折交叉验证
# Scheme 2: linear regression evaluated with 10-fold cross-validation.
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10)
rmse_scores = []
for train_indices, test_indices in kf.split(X):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    # BUG FIX: LinearRegression(normalize=True) — the `normalize`
    # parameter was deprecated in scikit-learn 1.0 and removed in 1.2;
    # OLS predictions do not need it.
    LR = LinearRegression()
    LR.fit(X_train, y_train)
    y_pred = LR.predict(X_test)
    # Log-space RMSE; abs() guards np.log against the occasional
    # negative OLS prediction.
    # BUG FIX: the original took sqrt of the *mean absolute* error.
    rmse = np.sqrt(mean_squared_error(np.log(y_test), np.log(abs(y_pred))))
    rmse_scores.append(rmse)
print("rmse scores : ", rmse_scores)
print(f'average rmse score : {np.mean(rmse_scores)}')
方案三:随机森林+K折交叉验证
# Scheme 3: random forest with 10-fold cross-validation.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10)
rmse_scores = []
for train_indices, test_indices in kf.split(X):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    RFR = RandomForestRegressor()
    RFR.fit(X_train, y_train)
    y_pred = RFR.predict(X_test)
    # BUG FIX: the original stored mean_absolute_error in a variable
    # named `rmse` and reported it as an RMSE score; compute the real
    # root-mean-squared error.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
print("rmse scores : ", rmse_scores)
print(f'average rmse scores : {np.mean(rmse_scores)}')
方案四:LightGBM+K折交叉验证
# Scheme 4: LightGBM (regression) with 10-fold cross-validation.
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10)
rmse_scores = []
for train_indices, test_indices in kf.split(X):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    # Default-parameter base model; tuned later in section 8.
    LGBR = lgb.LGBMRegressor()
    LGBR.fit(X_train, y_train)
    y_pred = LGBR.predict(X_test)
    # BUG FIX: the original computed mean_absolute_error but named and
    # reported it as RMSE; compute the actual RMSE.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
print("rmse scores : ", rmse_scores)
print(f'average rmse scores : {np.mean(rmse_scores)}')
方案五:xgboost+K折交叉验证
# Scheme 5: XGBoost with 10-fold cross-validation.
import xgboost as xgb
from sklearn.metrics import mean_squared_error

kf = KFold(n_splits=10)
rmse_scores = []
for train_indices, test_indices in kf.split(X):
    X_train, X_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]
    XGBR = xgb.XGBRegressor()
    XGBR.fit(X_train, y_train)
    y_pred = XGBR.predict(X_test)
    # BUG FIX: the original computed mean_absolute_error but named and
    # reported it as RMSE; compute the actual RMSE.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    rmse_scores.append(rmse)
print("rmse scores : ", rmse_scores)
print(f'average rmse scores : {np.mean(rmse_scores)}')
7. 模型预测
# Refit the base LightGBM model on the full training data, then predict
# the unlabeled test set.
LGBR.fit(X, y)
test_pred = LGBR.predict(test.drop('Id', axis=1).values)
# BUG FIX: a Kaggle submission needs the row Id next to each prediction;
# the original CSV contained only the SalePrice column.
result_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_pred})
result_df.to_csv('LGBR_base_model.csv', index=False, header=True)
# Quick sanity plot of the predicted price series.
result_df['SalePrice'].plot(figsize=(16, 8))
8. LightGBM算法调参
# ---- Manually tuned LightGBM via the native lgb.train API ----
# Wrap the holdout split in LightGBM's Dataset format; `reference` ties
# the validation bins to the training bins.
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

params = {
    'objective': 'regression',
    'metric': 'rmse',
    'learning_rate': 0.1,
    'max_depth': 15,
    'num_leaves': 20,
}

# BUG FIX: the `early_stopping_rounds` keyword was removed from
# lgb.train() in LightGBM 4.0 — early stopping is now configured through
# the callbacks list.
model = lgb.train(params=params,
                  train_set=train_data,
                  num_boost_round=300,
                  valid_names=['test'],
                  valid_sets=[test_data],
                  callbacks=[lgb.early_stopping(stopping_rounds=30)])
# Best validation RMSE reached before early stopping triggered.
score = model.best_score['test']['rmse']

test_pred = model.predict(test.drop('Id', axis=1).values)
# BUG FIX: include the row Id so the CSV is a valid Kaggle submission.
result_df2 = pd.DataFrame({'Id': test['Id'], 'SalePrice': test_pred})
result_df2.to_csv('LGBR_model2.csv', index=False, header=True)
result_df2['SalePrice'].plot(figsize=(16, 8))
实例源代码链接
|