一、项目介绍
本项目以某互联网贷款网站公开的借款人个人信息为背景,利用历史数据建立模型,预测是否应向新的贷款申请人发放贷款,以实现利润最大化。 网站地址:http://lendingclub.com 数据:链接:https://pan.baidu.com/s/1NoU9oGnRS70d8663SQPKVQ 提取码:oldg
二、数据清洗过滤无用特征
import pandas as pd
# Section 2 script: load the raw loan export, discard columns that are not
# used as features, encode the loan outcome as 0/1, and save the result.

# Raw strings keep the Windows backslashes literal instead of relying on
# Python passing unknown escape sequences (``\B``, ``\l`` ...) through.
RAW_CSV = r"E:\BaiduNetdiskDownload\唐宇迪机器学习\贷款利润最大化\LoanStats3a.csv"
TRIMMED_CSV = r"E:\BaiduNetdiskDownload\唐宇迪机器学习\贷款利润最大化\loans2007.csv"
FILTERED_CSV = r"E:\BaiduNetdiskDownload\唐宇迪机器学习\贷款利润最大化\filtered_loans2007.csv"

# skiprows=1 skips the first line of the file, so the second line is the header.
loans2007 = pd.read_csv(RAW_CSV, engine="python", skiprows=1)

# Keep only the columns that have non-null values in at least half of the rows.
half = len(loans2007) / 2
loans2007 = loans2007.dropna(thresh=half, axis=1)
loans2007 = loans2007.drop(["desc", "url"], axis=1)

# Save the trimmed table and reload it from disk before continuing.
loans2007.to_csv(TRIMMED_CSV, index=False)
loans2007 = pd.read_csv(TRIMMED_CSV, engine="python")
print(loans2007.iloc[0])
print(loans2007.shape[1])

# Columns removed by hand. NOTE(review): presumably identifiers, redundant
# fields and values only known after a loan was issued - confirm against the
# dataset's data dictionary.
loans2007 = loans2007.drop(
    [
        "id", "member_id", "funded_amnt", "funded_amnt_inv",
        "grade", "sub_grade", "emp_title", "issue_d",
    ],
    axis=1,
)
loans2007 = loans2007.drop(
    [
        "zip_code", "out_prncp", "out_prncp_inv",
        "total_pymnt", "total_pymnt_inv", "total_rec_prncp",
    ],
    axis=1,
)
loans2007 = loans2007.drop(
    [
        "total_rec_int", "total_rec_late_fee", "recoveries",
        "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt",
    ],
    axis=1,
)

# Keep only the two loan_status values that are used as the binary target.
print(loans2007["loan_status"].value_counts())
is_fully_paid = loans2007["loan_status"] == "Fully Paid"
is_charged_off = loans2007["loan_status"] == "Charged Off"
loans2007 = loans2007[is_fully_paid | is_charged_off]

# Encode the target: 1 = "Fully Paid", 0 = "Charged Off".
status_replace = {
    "loan_status": {
        "Fully Paid": 1,
        "Charged Off": 0,
    }
}
# DataFrame.replace returns a new frame, so the result must be assigned back;
# otherwise loan_status stays as text.
loans2007 = loans2007.replace(status_replace)

# Drop columns that hold a single distinct non-null value.
columns = loans2007.columns
drop_columns = [
    col for col in columns
    if len(loans2007[col].dropna().unique()) == 1
]
loans2007 = loans2007.drop(drop_columns, axis=1)
print(drop_columns)
print(loans2007.shape)

# index=False keeps the row index out of the file, so reading it back does not
# produce an extra "Unnamed: 0" column.
loans2007.to_csv(FILTERED_CSV, index=False)
三、数据预处理
import pandas as pd
# Section 3 script: handle missing values, convert text columns to numbers,
# one-hot encode the categorical columns, and save the model-ready table.

loans = pd.read_csv(
    r"E:\BaiduNetdiskDownload\唐宇迪机器学习\贷款利润最大化\filtered_loans2007.csv",
    engine="python",
)

# Inspect missing values, then drop every row that contains any null.
null_counts = loans.isnull().sum()
print(null_counts)
print(loans["emp_length"].isnull().sum())
loans = loans.dropna(axis=0)

# Inspect the column dtypes and the text (object) columns that need converting.
print(loans.dtypes.value_counts())
object_columns = loans.select_dtypes(include=["object"])
print(object_columns.iloc[0])
print(loans["purpose"].value_counts())
print(loans["title"].value_counts())
print(loans["emp_length"].unique())

# Map the emp_length labels to integers ("10+ years" -> 10, "< 1 year" -> 0).
mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "< 1 year": 0,
        "3 years": 3,
        "8 years": 8,
        "9 years": 9,
        "4 years": 4,
        "5 years": 5,
        "1 year": 1,
        "6 years": 6,
        "2 years": 2,
        "7 years": 7,
    }
}

loans = loans.drop(
    ["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"],
    axis=1,
)

# Percent strings such as "10.65%" become plain floats.
loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
loans = loans.replace(mapping_dict)

# emp_length is deliberately NOT listed here: it has just been mapped to
# integers. Listing it would let get_dummies pass the numeric column through,
# concat would duplicate it, and the drop below would remove both copies,
# losing the feature.
cat_columns = ["home_ownership", "verification_status", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns, axis=1)
loans = loans.drop("pymnt_plan", axis=1)

loans.to_csv('cleaned_loans2007.csv', index=False)
四、获得最大利润的条件与做法
import pandas as pd
# Section 4 script: split the cleaned table into features and target, then get
# out-of-fold predictions for every row with 5-fold cross-validation.

loans = pd.read_csv("cleaned_loans2007.csv")
print(loans.info())

# Every column except loan_status is used as a feature.
cols = loans.columns
train_col = cols.drop("loan_status")
features = loans[train_col]
target = loans["loan_status"]

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_predict, KFold

# NOTE(review): loan_status is a 0/1 target, so LinearRegression returns
# continuous scores rather than class labels - confirm whether a classifier
# (e.g. LogisticRegression) was intended.
lr = LinearRegression()

# random_state only applies when shuffle=True; recent scikit-learn raises
# ValueError if it is set together with shuffle=False. Omitting it keeps the
# original unshuffled, deterministic folds.
kf = KFold(n_splits=5, shuffle=False)
predictions = cross_val_predict(lr, features, target, cv=kf)
predictions = pd.Series(predictions)
(未完待续)
|