import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
一.小练习1
# Toy student records with deliberately dirty entries (mixed-language gender
# labels, a missing age, non-numeric scores, a missing English score) used by
# the cleaning exercises that follow.
data = {
    'gender': ['男', '男', 'male', 'male', 'female', '女'],
    'age': [17, np.nan, 17, 16, 18, 19],
    '语文': ['10o', 89, 32, 23, 23, 23],
    '数学': [23, '-', 32, 23, 23, 32],
    '英语': [23, 45, 32, 23, None, 32],
}
df = pd.DataFrame(data)
df
| gender | age | 语文 | 数学 | 英语 |
---|
0 | 男 | 17.0 | 10o | 23 | 23.0 |
---|
1 | 男 | NaN | 89 | - | 45.0 |
---|
2 | male | 17.0 | 32 | 32 | 32.0 |
---|
3 | male | 16.0 | 23 | 23 | 23.0 |
---|
4 | female | 18.0 | 23 | 23 | NaN |
---|
5 | 女 | 19.0 | 23 | 32 | 32.0 |
---|
1.量化gender
# Encode gender as an integer flag: 1 for '男' / 'male', 0 for any other label.
df['gender'] = df['gender'].map(lambda g: 1 if (g == '男' or g == 'male') else 0)
df['gender']
0 1
1 1
2 1
3 1
4 0
5 0
Name: gender, dtype: int64
# Alternative 1: Series.apply with the same element-wise rule used with Series.map.
# NOTE(review): every alternative below compares against the raw string labels.
# Once gender has already been encoded to 0/1, no value equals '男' or 'male',
# so re-running any of them turns the whole column into 0 -- presumably each is
# meant to be tried on a freshly built df; confirm before running in sequence.
df.gender = df.gender.apply(lambda x:1 if x =='男'or x=='male' else 0)
df.gender
# Alternative 2: DataFrame.applymap on a one-column frame (note the double brackets).
df['gender'] = df[['gender']].applymap(lambda x:1 if x =='男'or x=='male' else 0)
df
# Alternative 3: vectorized boolean comparison; multiplying the boolean mask
# by 1 converts True/False into 1/0.
df.gender = ((df['gender']=='male') | (df['gender']=='男'))*1
df.gender
2.填充空值
平均值填充年龄
# Fill missing ages with the column mean. The result is assigned back to the
# column rather than calling fillna(inplace=True) on df.age: the inplace form
# mutates an intermediate Series and, with pandas Copy-on-Write, leaves df
# itself unchanged.
df['age'] = df['age'].fillna(df['age'].mean())
英语成绩的空值填充为 0
# Fill missing English scores with 0. Assign the result back instead of using
# fillna(inplace=True) on the selected column, which acts on an intermediate
# Series and does not update df under pandas Copy-on-Write.
df['英语'] = df['英语'].fillna(0)
3.将成绩中的异常值也替换为0
def replace_abnormal(item):
    """Return a cleaned score, mapping invalid string entries to 0.

    A string is accepted only when it consists solely of decimal digits and
    represents an integer in the closed range [0, 100]; it is then returned
    as an ``int``. Any other string (for example ``'10o'``, ``'-'`` or
    ``'101'``) becomes 0. Non-string values are returned unchanged.
    """
    if not isinstance(item, str):
        return item
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as '²' that int() cannot parse, which would raise ValueError.
    if not item.isdecimal():
        return 0
    score = int(item)
    return score if 0 <= score <= 100 else 0
# Clean every score column (positional columns 2 onward) element by element
# with replace_abnormal and write the result back into the same positions.
df.iloc[:,2:] = df.iloc[:,2:].applymap(replace_abnormal)
df
| gender | age | 语文 | 数学 | 英语 |
---|
0 | 1 | 17.0 | 0 | 23 | 23.0 |
---|
1 | 1 | 17.4 | 89 | 0 | 45.0 |
---|
2 | 1 | 17.0 | 32 | 32 | 32.0 |
---|
3 | 1 | 16.0 | 23 | 23 | 23.0 |
---|
4 | 0 | 18.0 | 23 | 23 | 0.0 |
---|
5 | 0 | 19.0 | 23 | 32 | 32.0 |
---|
4.replace 练习
# Show the encoded gender column decoded back to labels; the result is only
# displayed, not assigned, so df keeps the 0/1 encoding.
df['gender'].replace({1: '男', 0: '女'})
0 男
1 男
2 男
3 男
4 女
5 女
Name: gender, dtype: object
二.小练习2
1.导入数据
# Load the exercise data set; the literal text 'null' is parsed as NaN.
data1 = pd.read_csv('./day11复习/data1.txt', na_values='null')
data1
| Cust_id | x1 | x2 | Max_ovd_days |
---|
0 | 1 | 28 | 36.0 | 10 |
---|
1 | 2 | 28 | 36.0 | 0 |
---|
2 | 1 | 28 | 36.0 | 10 |
---|
3 | 2 | 28 | 36.0 | 0 |
---|
4 | 3 | 84 | 8.0 | 40 |
---|
5 | 4 | 35 | 8.0 | 15 |
---|
6 | 5 | 25 | NaN | 0 |
---|
7 | 6 | 48 | 15.0 | 5 |
---|
# Print each column's dtype and non-null count to spot missing values.
data1.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Cust_id 8 non-null int64
1 x1 8 non-null int64
2 x2 7 non-null float64
3 Max_ovd_days 8 non-null int64
dtypes: float64(1), int64(3)
memory usage: 384.0 bytes
2.将x1,x2 中的缺失值替换为平均值
# Replace missing values in x1 and x2 with the respective column mean.
# The filled Series is assigned back to the frame: calling
# fillna(inplace=True) on data1.x1 / data1.x2 mutates an intermediate Series
# and does not update data1 under pandas Copy-on-Write.
data1['x1'] = data1['x1'].fillna(data1['x1'].mean())
data1['x2'] = data1['x2'].fillna(data1['x2'].mean())
data1
| Cust_id | x1 | x2 | Max_ovd_days |
---|
0 | 1 | 28 | 36.0 | 10 |
---|
1 | 2 | 28 | 36.0 | 0 |
---|
2 | 1 | 28 | 36.0 | 10 |
---|
3 | 2 | 28 | 36.0 | 0 |
---|
4 | 3 | 84 | 8.0 | 40 |
---|
5 | 4 | 35 | 8.0 | 15 |
---|
6 | 5 | 25 | 25.0 | 0 |
---|
7 | 6 | 48 | 15.0 | 5 |
---|
3.生成y 逾期>=30 ->1 ; 其他 -> 0
def func(item, threshold=30):
    """Return 1 when ``item`` is at least ``threshold``, otherwise 0.

    Args:
        item: Value to classify (here, a maximum number of overdue days).
        threshold: Inclusive cut-off for the positive class. Defaults to 30,
            the value that was previously hard-coded.

    Returns:
        ``1`` if ``item >= threshold``, else ``0``.
    """
    return 1 if item >= threshold else 0
# Derive the binary target column y by applying func to every Max_ovd_days value.
data1['y'] = data1.Max_ovd_days.apply(func)
data1
| Cust_id | x1 | x2 | Max_ovd_days | y |
---|
0 | 1 | 28 | 36.0 | 10 | 0 |
---|
1 | 2 | 28 | 36.0 | 0 | 0 |
---|
2 | 1 | 28 | 36.0 | 10 | 0 |
---|
3 | 2 | 28 | 36.0 | 0 | 0 |
---|
4 | 3 | 84 | 8.0 | 40 | 1 |
---|
5 | 4 | 35 | 8.0 | 15 | 0 |
---|
6 | 5 | 25 | 25.0 | 0 | 0 |
---|
7 | 6 | 48 | 15.0 | 5 | 0 |
---|
4.划分数据集
from sklearn.model_selection import train_test_split
# Use only x1 and x2 as features. Cust_id is an identifier, and Max_ovd_days is
# the column y was computed from, so keeping it would leak the label into X.
X, y = data1[['x1', 'x2']], data1['y']
# Split the same X/y pair that is used for cross-validation so both steps
# see an identical feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y)
5.网格搜索获得最优参数建模
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')
# Candidate neighbor counts for the 3-fold grid search.
param_grid = {'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15]}
gscv = GridSearchCV(KNeighborsClassifier(), param_grid=param_grid, cv=3)
gscv.fit(X_train, y_train)
print(gscv.best_params_)
# Build the final model from the parameters the search actually selected
# instead of hard-coding n_neighbors=1, so the two cannot drift apart when
# the data or the grid changes.
knn = KNeighborsClassifier(**gscv.best_params_)
from sklearn.model_selection import cross_val_score
scores = cross_val_score(knn, X, y, cv=3)
scores.mean()
{'n_neighbors': 1}
0.8888888888888888
三:欠采样
def RandomUnderSample(x,y,seed,multiple1):
    """Randomly under-sample the majority class of a 0/1-labelled data set.

    Args:
        x: Feature DataFrame whose index labels match those of ``y``.
        y: Single-column DataFrame of 0/1 labels; its first column is used
            as the label column.
        seed: Seed passed to ``np.random.seed`` so the draw is reproducible.
        multiple1: Number of majority rows to keep per minority row
            (1 gives a 1:1 ratio, 5 gives 1:5).

    Returns:
        A tuple ``(X_under_sample, Y_under_sample)`` containing every
        minority-class row followed by the sampled majority-class rows,
        selected from ``x`` and ``y`` by index label.
    """
    np.random.seed(seed)
    label = y.columns[0]
    indices_0 = np.array(y[y[label] == 0].index)
    indices_1 = np.array(y[y[label] == 1].index)
    # On a tie class 0 is treated as the majority class.
    if len(indices_1) > len(indices_0):
        max_array, min_array = indices_1, indices_0
    else:
        max_array, min_array = indices_0, indices_1
    # Sample without replacement so no majority row is duplicated, and never
    # request more rows than the majority class actually has.
    sample_size = min(int(len(min_array) * multiple1), len(max_array))
    random_1_indices = np.random.choice(max_array, sample_size, replace=False)
    index = np.concatenate([min_array, random_1_indices])
    X_under_sample = x.loc[index, :]
    Y_under_sample = y.loc[index, :]
    return X_under_sample, Y_under_sample
四.导包
基础导入
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = 'Songti SC'
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
import warnings
warnings.filterwarnings('ignore')
机器学习方法总结
线性回归,线性分类
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.linear_model import LogisticRegression,SGDClassifier
KNN
from sklearn.neighbors import KNeighborsClassifier,KNeighborsRegressor
KMeans
from sklearn.cluster import KMeans
贝叶斯
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
决策树
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
支持向量机
from sklearn.svm import SVR,SVC
集成学习方法
from sklearn.ensemble import BaggingClassifier,BaggingRegressor
from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier,ExtraTreesRegressor
序列学习方法
from sklearn.ensemble import AdaBoostRegressor,AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor,GradientBoostingClassifier
from xgboost import XGBRegressor,XGBClassifier
特征选择
Filter 基于方差选择
from sklearn.feature_selection import VarianceThreshold
Wrapper
from sklearn.feature_selection import RFE
Embedded
基于惩罚项的特征选择法
from sklearn.feature_selection import SelectFromModel
基于树模型的特征选择法
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import GradientBoostingClassifier
区间缩放
from sklearn.preprocessing import MinMaxScaler
标准化
from sklearn.preprocessing import StandardScaler
归一化
from sklearn.preprocessing import Normalizer
对定量特征二值化
from sklearn.preprocessing import Binarizer
对定性特征哑编码
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
PCA
from sklearn.decomposition import PCA
LDA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
网格搜索
from sklearn.model_selection import GridSearchCV
交叉验证
from sklearn.model_selection import cross_val_score
集成学习
from sklearn.ensemble import VotingClassifier
回归器性能评估
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import explained_variance_score
from sklearn.metrics import r2_score
`
|