仅仅记录一下简单的特征分析法:
数据集仍是:Pima(皮马)印第安人糖尿病数据集(来源:CSDN 下载页《皮马印第安人糖尿病数据集免费下载分享》,机器学习文档类资源)。
主要是:数据分布、散点图、相关性矩阵分析等
代码如下:
from operator import index
from numpy import loadtxt
from numpy import sort
from matplotlib import pyplot
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Exploratory feature analysis of the Pima Indians diabetes CSV:
# summary statistics, missing-value counts, histograms, density plots,
# a pairwise scatter matrix and a correlation heatmap.

# ---------------------- Load data ---------------------------------------------
# Location of the CSV on the author's machine; adjust for your environment.
DATA_PATH = r'C:\Users\Administrator\Desktop\pima-indians-diabetes.csv'
# skiprows=1 drops the first line of the file (treated as a header row).
dataset = loadtxt(DATA_PATH, delimiter=",", skiprows=1)

# ---------------------- Convert the array to a DataFrame with column names ----
COLUMN_NAMES = [
    'pregnants',
    'Plasma_glucose_concentration',
    'blood_pressure',
    'Triceps_skin_fold_thickness',
    'serum_insulin',
    'BMI',
    'Diabetes_pedigree_function',
    'Age',
    'Target',
]
data = np.array(dataset)
df = pd.DataFrame(data=data, columns=COLUMN_NAMES)

# ---------------------- Descriptive statistics --------------------------------
# A bare `df.describe()` is only displayed in an interactive session; print it
# so the summary is also visible when this file is run as a script.
print(df.describe())

# ---------------------- Missing-value check -----------------------------------
# NOTE(review): this only counts NaN entries. If the CSV encodes missing
# measurements as 0 (commonly reported for this dataset), they will not be
# counted here -- confirm against the data.
df_missing_count = df.isnull().sum()
plt.rcParams['figure.figsize'] = (15, 8)  # default size for subsequent figures
df_missing_count.plot.bar()

# ---------------------- Per-column histograms ---------------------------------
df.hist(figsize=(16, 14))

# ---------------------- Per-column density plots ------------------------------
# A 3x4 grid gives 12 slots for the 9 columns; each subplot keeps its own x-axis.
df.plot(kind='density', subplots=True, layout=(3, 4), sharex=False)

# ---------------------- Pairwise scatter plots --------------------------------
# Points are coloured by the 'Target' column.
sns.pairplot(df, hue="Target")

# ---------------------- Correlation matrix heatmap ----------------------------
figure, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(df.corr(), square=True, annot=True, ax=ax)

# Display every figure created above.
plt.show()
最后结果:
(原文此处应为运行结果图片,图片未能保留。)