数据分析chapter2
提示:写完文章后,目录可以自动生成,如何生成可参考右边的帮助文档
一、数据清洗和特征处理
1.数据清洗
针对缺失、异常数据进行处理
import numpy as np
import pandas as pd
# Load the training data, then inspect column dtypes and per-column missing counts.
df = pd.read_csv('train.csv')
df.info()
df.isnull().sum()
df[['Age','Cabin','Embarked']].head(3)
# Missing ages can only be located with isnull(): the masks `df['Age'] == None`
# and `df['Age'] == np.nan` are False for every row (NaN never compares equal
# to anything), so assignments through them change nothing and are omitted.
# NOTE(review): `df[mask] = 0` overwrites EVERY column of the matching rows,
# not just Age, so drop_duplicates() below collapses those identical all-zero
# rows into one. Later steps run on that result; confirm the intent before
# narrowing this to the Age column.
df[df['Age'].isnull()] = 0
# Previews only: dropna()/fillna() return new frames and leave df untouched.
df.dropna().head(3)
df.fillna(0).head(3)
# Show fully duplicated rows, drop them, and save the cleaned frame.
df[df.duplicated()]
df = df.drop_duplicates()
df.to_csv('test_clear.csv')
示例:pandas 是基于NumPy 的一种工具,该工具是为了解决数据分析任务而创建的。
2.特征处理
把特征大概分为两大类:数值型特征和文本型特征。数值型特征包括 Survived、Pclass、Age、SibSp、Parch、Fare,其中 Survived、Pclass 为离散型数值特征,Age、SibSp、Parch、Fare 为连续型数值特征;文本型特征包括 Name、Sex、Cabin、Embarked、Ticket,其中 Sex、Cabin、Embarked、Ticket 为类别型文本特征。
数值型特征一般可以直接用于模型的训练,但有时候为了模型的稳定性及鲁棒性会对连续变量进行离散化。文本型特征往往需要转换成数值型特征才能用于建模分析。
# Discretise Age three different ways; each assignment overwrites AgeBand and
# the intermediate results are saved to separate CSV files.
# 1) Five equal-width bins labelled 1-5.
df['AgeBand'] = pd.cut(df['Age'], 5,labels = [1,2,3,4,5])
df.head()
df.to_csv('test_ave.csv')
# 2) Explicit bin edges 0/5/15/30/50/80.
# NOTE(review): pd.cut bins are right-closed by default, so an Age of exactly 0
# presumably falls outside the first bin and gets NaN; confirm this is intended.
df['AgeBand'] = pd.cut(df['Age'],[0,5,15,30,50,80],labels = [1,2,3,4,5])
# 3) Quantile bins at the 0/10/30/50/70/90% quantiles.
# NOTE(review): the last edge is the 0.9 quantile, so ages above it presumably
# get NaN; confirm this is intended.
df['AgeBand'] = pd.qcut(df['Age'],[0,0.1,0.3,0.5,0.7,0.9],labels = [1,2,3,4,5])
df.to_csv('test_pr.csv')
# Frequency of each category in the text columns.
df['Sex'].value_counts()
df['Cabin'].value_counts()
df['Embarked'].value_counts()
# Distinct values of Sex and how many there are.
df['Sex'].unique()
df['Sex'].nunique()
# Encode Sex numerically (male -> 1, female -> 2), first with replace() ...
df['Sex_num'] = df['Sex'].replace(['male','female'],[1,2])
df.head()
# ... then with map(), which overwrites the column written by replace().
df['Sex_num'] = df['Sex'].map({'male': 1, 'female': 2})
df.head()
from sklearn.preprocessing import LabelEncoder
# Label-encode the Cabin and Ticket columns into "<column>_labelEncode".
# The loop body must be indented; without indentation the `for` statement is a
# syntax error.
for feat in ['Cabin', 'Ticket']:
    lbl = LabelEncoder()
    # Manual encoding: map each distinct value to a running integer.
    label_dict = dict(zip(df[feat].unique(), range(df[feat].nunique())))
    df[feat + "_labelEncode"] = df[feat].map(label_dict)
    # sklearn encoding on the stringified column; this overwrites the manual
    # encoding written just above.
    df[feat + "_labelEncode"] = lbl.fit_transform(df[feat].astype(str))
df.head()
# One-hot encode Age and Embarked and append the indicator columns to df.
# The loop body must be indented; without indentation the `for` statement is a
# syntax error.
for feat in ["Age", "Embarked"]:
    # One indicator column per distinct value, named "<feat>_<value>".
    x = pd.get_dummies(df[feat], prefix=feat)
    df = pd.concat([df, x], axis=1)
df.head()
# Extract the run of letters that immediately precedes a '.' in Name into a
# new Title column. The pattern is a raw string so that `\.` reaches the regex
# engine as written instead of being an invalid string escape sequence.
df['Title'] = df.Name.str.extract(r'([A-Za-z]+)\.', expand=False)
df.head()
# Save the frame with all engineered features.
df.to_csv('test_fin.csv')
二、数据重构
1.数据合并
代码如下(示例):
import numpy as np
import pandas as pd
# Load train-left-up.csv from the data folder
# NOTE(review): this read uses a path without the "data/" prefix, unlike the
# four reads below; confirm which location is correct.
text = pd.read_csv('train-left-up.csv')
# Load the four quadrant files that are merged in the following steps.
text_left_up = pd.read_csv("data/train-left-up.csv")
text_left_down = pd.read_csv("data/train-left-down.csv")
text_right_up = pd.read_csv("data/train-right-up.csv")
text_right_down = pd.read_csv("data/train-right-down.csv")
## 1. Merge with pd.concat
# Put the left/right halves side by side (axis=1), then stack the upper half
# on top of the lower half.
list_up = [text_left_up,text_right_up]
result_up = pd.concat(list_up,axis=1)
list_down=[text_left_down,text_right_down]
result_down = pd.concat(list_down,axis=1)
result = pd.concat([result_up,result_down])
## 2. Merge with DataFrame.join
# The upper half is assigned to result_up (previously misspelled, so the
# stacking step silently reused the frame from section 1).
result_up = text_left_up.join(text_right_up)
result_down = text_left_down.join(text_right_down)
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same
# row-wise stacking.
result = pd.concat([result_up, result_down])
result.head()
## 3. Merge with pd.merge
# Align the left/right halves on their row index.
result_up = pd.merge(text_left_up,text_right_up,left_index=True,right_index=True)
result_down = pd.merge(text_left_down,text_right_down,left_index=True,right_index=True)
result = pd.concat([result_up, result_down])
# Save the reassembled table for the following sections.
result.to_csv('result.csv')
<font color=#999AAA>代码如下(示例):</font>
```python
# Read the sample CSV straight from its URL and print the first rows.
adult_csv_url = 'https://labfile.oss.aliyuncs.com/courses/1283/adult.data.csv'
data = pd.read_csv(adult_csv_url)
print(data.head())
```
2.另外一个角度看数据
# Reload the merged table written earlier.
text = pd.read_csv('result.csv')
text.head()
# Stack the columns into the row index and keep the first 20 entries.
stacked = text.stack()
unit_result = stacked.head(20)
# Write the stacked slice out and read it back in.
unit_result.to_csv('unit_result.csv')
test = pd.read_csv('unit_result.csv')
数据聚合与运算
# Mean Fare for each Sex.
df = text.groupby('Sex')['Fare']
means = df.mean()
# Sum of Survived for each Sex.
survived_sex = text.groupby('Sex')['Survived'].sum()
survived_sex.head()
# Sum of Survived for each Pclass.
survived_pclass = text.groupby('Pclass')['Survived']
survived_pclass.sum()
# Mean Fare and row count per Sex in a single agg() call, with renamed columns.
renamed_columns = {'Fare': 'mean_fare', 'Pclass': 'count_pclass'}
text.groupby('Sex').agg({'Fare': 'mean', 'Pclass': 'count'}).rename(columns=renamed_columns)
# Mean Fare for each (Pclass, Age) combination.
text.groupby(['Pclass','Age'])['Fare'].mean().head()
# Combine the two per-Sex results into one table and save it.
result = pd.merge(means,survived_sex,on='Sex')
result
result.to_csv('sex_fare_survived.csv')
# Sum of Survived for each Age, and the age(s) holding the maximum.
survived_age = text.groupby('Age')['Survived'].sum()
survived_age.head()
survived_age[survived_age == survived_age.max()]
# Overall sum of Survived, printed bare and then with a label.
_sum = text['Survived'].sum()
print(_sum)
print("sum of person:"+str(_sum))
# Largest per-age sum divided by the overall sum.
precetn = survived_age.max()/_sum
print("最大存活率:"+str(precetn))
该处使用的是通过 URL 网络请求获取的数据。
三、数据可视化
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Reload the merged table.
text = pd.read_csv(r'result.csv')
text.head()
# Bar chart of the sum of Survived for each Sex.
sex = text.groupby('Sex')['Survived'].sum()
sex.plot.bar()
plt.title('survived_count')
plt.show()
# Stacked bar chart of row counts per (Sex, Survived) pair. `stacked` takes a
# boolean; the string 'True' only worked because any non-empty string is
# truthy (the string 'False' would have stacked the bars as well).
text.groupby(['Sex','Survived'])['Survived'].count().unstack().plot(kind='bar',stacked=True)
plt.title('survived_count')
plt.ylabel('count')
# Count each Survived value within every Fare group, sorted by count in
# descending order, and draw the counts as a line plot.
fare_sur = text.groupby(['Fare'])['Survived'].value_counts().sort_values(ascending=False)
fare_sur
# Open a new 20x18 figure so the plot below is drawn on it.
fig = plt.figure(figsize=(20, 18))
fare_sur.plot(grid=True)
plt.legend()
plt.show()
# Same counts without the sort_values() step, plotted the same way for comparison.
fare_sur1 = text.groupby(['Fare'])['Survived'].value_counts()
fare_sur1
fig = plt.figure(figsize=(20, 18))
fare_sur1.plot(grid=True)
plt.legend()
plt.show()
# Count each Survived value within every Pclass group.
pclass_sur = text.groupby(['Pclass'])['Survived'].value_counts()
pclass_sur
import seaborn as sns
# Count plot with one bar group per Pclass, split by Survived via hue.
sns.countplot(x="Pclass", hue="Survived", data=text)
# Density curve of Age for each Survived value, drawn on one wide facet.
facet = sns.FacetGrid(text, hue="Survived",aspect=3)
# `fill=True` shades the area under each curve; it replaces the deprecated
# `shade=` keyword of seaborn.kdeplot.
facet.map(sns.kdeplot,'Age',fill=True)
# Limit the x axis to the range from 0 to the largest Age in the data.
facet.set(xlim=(0, text['Age'].max()))
facet.add_legend()
# Overlay one Age density curve per Pclass value on the same axes.
for pclass in (1, 2, 3):
    text.Age[text.Pclass == pclass].plot(kind='kde')
plt.xlabel("age")
# Legend entries follow the order in which the curves were drawn.
plt.legend((1,2,3),loc="best")
|