[人工智能] 数据挖掘（二）预测潜在贷款发放客户

开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> 人工智能 -> 数据挖掘（二）预测潜在贷款发放客户 -> 正文阅读

[人工智能]数据挖掘（二）预测潜在贷款发放客户

注：参考多篇csdn及b站文章所得

一、实验背景

某机构想要预测哪些客户可能会产生贷款违约行为。他们搜集了历史客户行为的部分数据以及目标客户的信息,希望利用历史数据预测目标客户中哪些会是潜在的违约客户,从而缩小目标范围,实现低风险贷款发放。

搜集到的数据以.CSV存储,分别包括历史客户和目标客户两个文件｡数据描述如下:

字段名	字段描述	数据类型
income	客户收入	int
age	客户年龄	int
experience	工作年限	int
profession	职业	string
married	婚否	string
house_ownership	有房/租房/其它	string
car_ownership	是否有车	string
risk_flag	是否拖欠贷款	string
currentjobyears	现有工作年限	int
currenthouseyears	在当前住所居住时长	int
city	居住城市	string
state	居住州/邦	string

二、实验内容

基于分类方法,根据客户历史行为预测潜在的贷款客户

三、实验步骤

1.导入数据

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
from sklearn.preprocessing import LabelEncoder
warnings.filterwarnings('ignore')

# 1. Data loading
# Both CSV files are read from a fixed local directory: the historical
# customers go into app_train and the target customers into app_test.
train_csv_path = 'D:/班级作业/数据挖掘/实验报告/实验二/archive/historic customer behavior.csv'
test_csv_path = 'D:/班级作业/数据挖掘/实验报告/实验二/archive/target customer.csv'
app_train = pd.read_csv(train_csv_path)
app_test = pd.read_csv(test_csv_path)

2.数据探索性分析

#二、数据探索性分析
# 画图代码
def plot_stats(feature, label_rotation=False, horizontal_layout=True):
    """Plot row counts and the mean ``Risk_Flag`` per value of one feature.

    Reads the module-level ``app_train`` DataFrame and draws two bar charts:
    the number of rows for each value of ``feature``, and the mean of
    ``Risk_Flag`` for each value expressed in percent (the default rate when
    the flag is coded 0/1), ordered from highest to lowest.

    Args:
        feature: Name of the ``app_train`` column to group by.
        label_rotation: If true, rotate the x tick labels of both charts by
            90 degrees.
        horizontal_layout: If true, place the charts side by side; otherwise
            stack them vertically.
    """
    counts = app_train[feature].value_counts()
    count_df = pd.DataFrame({feature: counts.index, 'Number of contracts': counts.values})
    # Mean of Risk_Flag per category, scaled to percent so the values agree
    # with the "[%]" unit shown on the y-axis label below.
    cat_perc = app_train[[feature, 'Risk_Flag']].groupby([feature], as_index=False).mean()
    cat_perc['Risk_Flag'] = cat_perc['Risk_Flag'] * 100
    cat_perc.sort_values(by='Risk_Flag', ascending=False, inplace=True)
    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))
    sns.set_color_codes("pastel")
    sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=count_df)
    sns.barplot(ax=ax2, x=feature, y='Risk_Flag', order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        for ax in (ax1, ax2):
            plt.setp(ax.get_xticklabels(), rotation=90)
    # Address the second chart explicitly instead of relying on the implicit
    # "current axes" of pyplot.
    ax2.set_ylabel('Percent of Risk_Flag with value 1 [%]', fontsize=10)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    fig.subplots_adjust(left=0.05, top=0.91, bottom=0.09)
    plt.show()
def plot_distribution(var):
    """Draw KDE curves of several features, split by ``Risk_Flag``.

    Reads the module-level ``app_train`` DataFrame. Each feature gets its own
    subplot with one density curve for rows where ``Risk_Flag`` is non-zero
    and one for rows where it is zero.

    Args:
        var: Iterable of ``app_train`` column names to plot.
    """
    feature_list = list(var)
    t1 = app_train.loc[app_train['Risk_Flag'] != 0]
    t0 = app_train.loc[app_train['Risk_Flag'] == 0]
    sns.set_style('whitegrid')
    # Two columns; at least the original 2x2 grid, growing by rows so that
    # more than four features no longer fall outside the grid.
    n_rows = max(2, (len(feature_list) + 1) // 2)
    fig, axes = plt.subplots(n_rows, 2, figsize=(6, 3 * n_rows))
    for ax, feature in zip(axes.ravel(), feature_list):
        # NOTE(review): `bw` is kept for compatibility with the seaborn
        # version this file was written for; newer releases call it
        # `bw_method` -- confirm against the installed version.
        sns.kdeplot(t1[feature], bw=0.5, label="Risk_Flag = 1", ax=ax)
        sns.kdeplot(t0[feature], bw=0.5, label="Risk_Flag = 0", ax=ax)
        ax.set_ylabel('Density plot', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=12)
        # Without an explicit legend the two curve labels may not be shown.
        ax.legend()
    plt.show()
plt.figure(figsize=(5, 6))

# 1. Effect of age
# KDE (kernel density estimate) of age for rows with Risk_Flag == 0
sns.kdeplot(app_train.loc[app_train['Risk_Flag'] == 0, 'Age'], label='Risk_Flag == 0')
# KDE of age for rows with Risk_Flag == 1
sns.kdeplot(app_train.loc[app_train['Risk_Flag'] == 1, 'Age'], label='Risk_Flag == 1')
# Axis labels, title and legend
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
plt.legend()
plt.gcf().subplots_adjust(left=0.05, right=0.91, top=0.9, bottom=0.09)
plt.show()
# Counts and default rate per age
plot_stats('Age', False, False)
# 2. Effect of income
plot_stats('Income', False, False)
# 3. Effect of car / house ownership
# Use the column names that the rest of this file reads from app_train
# ('Car_Ownership', 'House_Ownership').
plot_stats('Car_Ownership')
plot_stats('House_Ownership')
# 4. Effect of marital status
plot_stats('Married/Single', True, True)
# 5. Effect of work experience
plot_stats('Experience', False, False)

1）用户年龄特征探索

数据表处理结果：

图2：按是否违约分组的年龄核密度估计（KDE）曲线图

图3：不同年龄段违约之人所占比例

分析：

由图2可知,违约用户中20-30的年轻用户分布更多,所以可以假设用户年龄越小,违约的可能性越大;

由图3可知,对用户的年龄进行分桶,进一步观察不同年龄段用户的违约概率,发现[20,25]、[25,30]两个年龄段的用户违约可能性最高,其余年龄段违约可能性相近。

2）用户是否有房

分析：由图可知,没有房的人比有房的人违约率更高

3）用户是否有车

分析：由图可知,没有车的人比有车的人违约率更高,但相差并不大

4）用户婚否

分析：由图可知,未婚的人比已婚的违约率更高

5）用户收入

分析：由图可以看出,收入较低的人违约几率较高

6）用户工作经验

分析：由图可知工作时间较短的人违约几率较高

7）热力图

3.特征预处理

构建新特征DAYS_EMPLOYED_PERCENT: 用户工作年限experience/客户年龄

#三、特征预处理
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

# Feature engineering: work on copies so app_train / app_test stay unchanged
app_train_domain = app_train.copy()
app_test_domain = app_test.copy()

# New feature DAYS_EMPLOYED_PERCENT = years of experience / age,
# computed the same way for the training and the test frame
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['Experience'] / app_train_domain['Age']
app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['Experience'] / app_test_domain['Age']

plt.figure(figsize=(10, 10))
# One subplot per engineered feature (currently a single one)
for i, feature in enumerate(['DAYS_EMPLOYED_PERCENT']):
    plt.subplot(1, 1, i + 1)
    # First the curve for Risk_Flag == 0, then the one for Risk_Flag == 1
    for flag_value, curve_label in ((0, 'Risk_Flag == 0'), (1, 'Risk_Flag == 1')):
        selected = app_train_domain.loc[app_train_domain['Risk_Flag'] == flag_value, feature]
        sns.kdeplot(selected, label=curve_label)
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')
plt.tight_layout(h_pad=2.5)
plt.show()

4.建模分析

#四、建模
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

#1. LGBMClassifier模型
def model(features, test_features, encoding='ohe', n_folds=5):
    """Train LightGBM with K-fold cross-validation and predict the test set.

    Args:
        features: Training DataFrame. Must contain the ``Id`` and
            ``Risk_Flag`` columns; every other column is used as a feature.
        test_features: Test DataFrame. Must contain the ``ID`` column.
        encoding: ``'ohe'`` for one-hot encoding (train and test columns are
            inner-aligned afterwards) or ``'le'`` for integer label encoding
            of the object-dtype columns.
        n_folds: Number of cross-validation folds.

    Returns:
        A tuple ``(submission, feature_importances, metrics)``.
        ``submission`` has the test ``ID`` and the fold-averaged predicted
        probability in ``Risk_Flag``. ``feature_importances`` has the
        fold-averaged importance per feature. ``metrics`` has the train and
        validation AUC of every fold plus an ``overall`` row (out-of-fold AUC
        for validation, mean of the fold scores for training).

    Raises:
        ValueError: If ``encoding`` is neither ``'ohe'`` nor ``'le'``.
    """
    # Keep the test identifiers for the submission frame
    test_ids = test_features['ID']
    # Plain NumPy labels: the fold indices produced by KFold are positional,
    # so they must not be resolved through the DataFrame index.
    labels = np.array(features['Risk_Flag'])
    # Drop identifiers and the target; drop() returns new frames, so the
    # caller's DataFrames are not modified by the encoding below.
    features = features.drop(columns=['Id', 'Risk_Flag'])
    test_features = test_features.drop(columns=['ID'])

    if encoding == 'ohe':
        # One-hot encoding
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        # Keep only the columns present in both frames
        features, test_features = features.align(test_features, join='inner', axis=1)
        # No explicit categorical column indices in this mode
        cat_indices = 'auto'
    elif encoding == 'le':
        # Integer label encoding; remember which columns are categorical
        cat_indices = []
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                train_values = features[col].astype(str)
                test_values = test_features[col].astype(str)
                # Fit on train and test values together so that a category
                # seen only in the test set does not make transform() fail.
                label_encoder = LabelEncoder()
                label_encoder.fit(pd.concat([train_values, test_values]))
                features[col] = label_encoder.transform(train_values)
                test_features[col] = label_encoder.transform(test_values)
                cat_indices.append(i)
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    feature_names = list(features.columns)
    # Work on NumPy arrays from here on
    features = np.array(features)
    test_features = np.array(test_features)

    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)
    # Accumulators averaged over the folds
    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])
    # Out-of-fold predictions for the overall validation score
    out_of_fold = np.zeros(features.shape[0])
    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(features):
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Named `clf` so the estimator does not shadow this function's name
        clf = lgb.LGBMClassifier(n_estimators=1000, objective='binary',
                                 class_weight='balanced', learning_rate=0.05,
                                 reg_alpha=0.1, reg_lambda=0.1,
                                 subsample=0.8, n_jobs=-1, random_state=50)
        # Early stopping and periodic logging are passed as callbacks; the
        # `early_stopping_rounds` / `verbose` keywords of fit() are not
        # accepted by LightGBM 4.x.
        clf.fit(train_features, train_labels, eval_metric='auc',
                eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                eval_names=['valid', 'train'], categorical_feature=cat_indices,
                callbacks=[lgb.early_stopping(stopping_rounds=100),
                           lgb.log_evaluation(period=200)])
        # Early stopping was used, so predict with the best iteration
        best_iteration = clf.best_iteration_
        feature_importance_values += clf.feature_importances_ / k_fold.n_splits
        test_predictions += clf.predict_proba(test_features, num_iteration=best_iteration)[:, 1] / k_fold.n_splits
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]
        # Best scores recorded during training
        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])
        # Free the per-fold objects before the next fold
        del clf, train_features, valid_features
        gc.collect()

    submission = pd.DataFrame({'ID': test_ids, 'Risk_Flag': test_predictions})
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})
    # Overall validation score computed on the out-of-fold predictions
    valid_auc = roc_auc_score(labels, out_of_fold)
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))
    fold_names = list(range(n_folds))
    fold_names.append('overall')
    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})
    return submission, feature_importances, metrics

# Train with the helper above and print the 5-fold cross-validation results
submission, fi, metrics = model(app_train_domain, app_test_domain)
print('Baseline metrics')
print(metrics)
submission.to_csv('my_submission1.csv', index=False)
# Release the engineered frames; gc.collect must be called (with
# parentheses) to actually run a collection.
del app_train_domain, app_test_domain
gc.collect()

def plot_feature_importances(df, top_n=15):
    """Plot the most important features as a horizontal bar chart.

    Args:
        df: DataFrame with a ``feature`` column and an ``importance`` column.
        top_n: Number of top-ranked features to draw (default 15).

    Returns:
        The DataFrame sorted by descending importance, with the previous
        index kept in an ``index`` column and an added
        ``importance_normalized`` column.
    """
    # Rank the features by importance
    df = df.sort_values('importance', ascending=False).reset_index()
    # Normalize so that the importances add up to 1
    df['importance_normalized'] = df['importance'] / df['importance'].sum()
    top = df.head(top_n)
    # Reversed positions put the most important feature at the top
    positions = list(reversed(list(top.index)))
    plt.figure(figsize=(5, 3))
    ax = plt.subplot()
    ax.barh(positions, top['importance_normalized'], align='center', edgecolor='k')
    # Label each bar with its feature name
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])
    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()
    return df


fi_sorted = plot_feature_importances(fi)

# 2. XGBoost model
# Inspect the categories of the categorical columns that are mapped below.
# Columns are selected by name rather than by position, so the check does not
# depend on the column order of the CSV.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(app_train[['Married/Single', 'House_Ownership', 'Car_Ownership']])
print(enc.categories_)

# Map the categorical columns to integers (values missing from a mapping
# become NaN)
app_train['Married/Single'] = app_train['Married/Single'].map({'single': 0, 'married': 1})
app_train['House_Ownership'] = app_train['House_Ownership'].map({'norent_noown': 0, 'rented': 1, 'owned': 2})
app_train['Car_Ownership'] = app_train['Car_Ownership'].map({'no': 0, 'yes': 1})
print(app_train.head())

# Standardize the numeric features (zero mean, unit variance)
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the held-out rows influence the scaling -- confirm this is acceptable.
from sklearn.preprocessing import StandardScaler
numeric_columns = ["Income", "Age", "Experience", "CURRENT_JOB_YRS", "CURRENT_HOUSE_YRS"]
app_train[numeric_columns] = StandardScaler().fit_transform(app_train[numeric_columns])
print(app_train.head())

# Split into training and validation sets
from sklearn.model_selection import train_test_split
X = app_train[
            ["Income", "Age", "Experience", "Married/Single", "House_Ownership", "Car_Ownership", "CURRENT_JOB_YRS",
             "CURRENT_HOUSE_YRS"]]
# 1-D label vector (a Series, not a one-column DataFrame), which is the shape
# scikit-learn estimators and metrics expect
y = app_train["Risk_Flag"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_test.head())

# Train the XGBoost model
from xgboost import XGBClassifier
print("---------------------xgboost forest---------------------")
# `n_estimators` was misspelled, so the intended 180 rounds were not applied.
# Early stopping is dropped: no eval_set was supplied, and a patience of 200
# rounds could never trigger within 180 boosting rounds anyway.
xgbc = XGBClassifier(n_estimators=180, n_jobs=-1, max_depth=10).fit(X_train, y_train)
print("训练集精度:")
result = xgbc.score(X_train, y_train)
print(result)
print("验证集精度:")
result = xgbc.score(X_test, y_test)
print(result)

# ROC curve
from sklearn.metrics import roc_curve, auc
# Reuse the model fitted above instead of training it a second time
y_score = xgbc.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
# Youden's J statistic (tpr - fpr) for every candidate threshold
right_index = (tpr + (1 - fpr) - 1)
index = int(np.argmax(right_index))
# `yuzhi` keeps its original meaning (the maximum J value); the probability
# threshold that attains it is stored separately in `best_threshold`.
yuzhi = right_index[index]
best_threshold = thresholds[index]
tpr_val = tpr[index]
fpr_val = fpr[index]
# Draw the ROC curve
plt.subplots(figsize=(7, 5.5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()

 # 3. Random forest classifier
# The statements below are top-level code, so they start at column 0.
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc
print("---------------------random forest---------------------")
# np.ravel gives the 1-D label vector scikit-learn expects, whether y_train
# is a Series or a one-column DataFrame.
rf = RandomForestClassifier(n_estimators=22, random_state=0, max_depth=20).fit(X_train, np.ravel(y_train))
print("训练集精度:")
result = rf.score(X_train, y_train)
print(result)
print("验证集精度:")
result = rf.score(X_test, y_test)
print(result)

# ROC curve
# Reuse the forest fitted above; refitting with the same random_state would
# only repeat the same training.
y_score = rf.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(np.ravel(y_test), y_score[:, 1])
roc_auc = auc(fpr, tpr)
# Youden's J statistic (tpr - fpr) for every candidate threshold
right_index = (tpr + (1 - fpr) - 1)
index = int(np.argmax(right_index))
# `yuzhi` keeps its original meaning (the maximum J value); the probability
# threshold that attains it is stored separately in `best_threshold`.
yuzhi = right_index[index]
best_threshold = thresholds[index]
tpr_val = tpr[index]
fpr_val = fpr[index]
# Draw the ROC curve
plt.subplots(figsize=(7, 5.5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()

1）利用LightGBM模型进行建模预测