第一章 数据载入及初步观察
1.1 载入数据
import numpy as np
import pandas as pd
import os
df=pd.read_csv('train.csv')
df.shape
(891, 12)
path = os.path.abspath('train.csv')
df = pd.read_csv(path)
df.head(3)
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
os.getcwd()
'/clusters/data_1080Ti_0/USER_DATA/shanghonglin/shl/hands-on-data-analysis/ one'
df1 = pd.read_table(path)
df2 = pd.read_table(path, sep=',')
df2
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
---|
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
---|
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
---|
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
---|
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
---|
891 rows × 12 columns
df=pd.read_csv('train.csv',chunksize=500)
type(df),type(pd.read_csv('train.csv'))
(pandas.io.parsers.readers.TextFileReader, pandas.core.frame.DataFrame)
df.get_chunk()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
495 | 496 | 0 | 3 | Yousseff, Mr. Gerious | male | NaN | 0 | 0 | 2627 | 14.4583 | NaN | C |
---|
496 | 497 | 1 | 1 | Eustis, Miss. Elizabeth Mussey | female | 54.0 | 1 | 0 | 36947 | 78.2667 | D20 | C |
---|
497 | 498 | 0 | 3 | Shellard, Mr. Frederick William | male | NaN | 0 | 0 | C.A. 6212 | 15.1000 | NaN | S |
---|
498 | 499 | 0 | 1 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S |
---|
499 | 500 | 0 | 3 | Svensson, Mr. Olof | male | 24.0 | 0 | 0 | 350035 | 7.7958 | NaN | S |
---|
500 rows × 12 columns
for piece in df:
print(piece)
print(len(piece))
df= pd.read_csv('train.csv')
df.columns=['乘客ID','是否幸存','仓位等级','姓名','性别','年龄','兄弟姐妹个数','父母子女个数','船票信息','票价','客舱','登船港口']
df
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S |
---|
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S |
---|
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S |
---|
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C |
---|
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q |
---|
891 rows × 12 columns
df1=pd.read_csv('train.csv', names=['乘客ID','是否幸存','仓位等级','姓名','性别','年龄','兄弟姐妹个数','父母子女个数','船票信息','票价','客舱','登船港口'],index_col='乘客ID',header=0)
df1.head()
| 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
乘客ID | | | | | | | | | | | |
---|
1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 乘客ID 891 non-null int64
1 是否幸存 891 non-null int64
2 仓位等级 891 non-null int64
3 姓名 891 non-null object
4 性别 891 non-null object
5 年龄 714 non-null float64
6 兄弟姐妹个数 891 non-null int64
7 父母子女个数 891 non-null int64
8 船票信息 891 non-null object
9 票价 891 non-null float64
10 客舱 204 non-null object
11 登船港口 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB
df.head(10)
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
5 | 6 | 0 | 3 | Moran, Mr. James | male | NaN | 0 | 0 | 330877 | 8.4583 | NaN | Q |
---|
6 | 7 | 0 | 1 | McCarthy, Mr. Timothy J | male | 54.0 | 0 | 0 | 17463 | 51.8625 | E46 | S |
---|
7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.0 | 3 | 1 | 349909 | 21.0750 | NaN | S |
---|
8 | 9 | 1 | 3 | Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) | female | 27.0 | 0 | 2 | 347742 | 11.1333 | NaN | S |
---|
9 | 10 | 1 | 2 | Nasser, Mrs. Nicholas (Adele Achem) | female | 14.0 | 1 | 0 | 237736 | 30.0708 | NaN | C |
---|
df.tail(5)
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.00 | NaN | S |
---|
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.00 | B42 | S |
---|
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.45 | NaN | S |
---|
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | C |
---|
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | Q |
---|
df.isnull().head()
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
0 | False | False | False | False | False | False | False | False | False | False | True | False |
---|
1 | False | False | False | False | False | False | False | False | False | False | False | False |
---|
2 | False | False | False | False | False | False | False | False | False | False | True | False |
---|
3 | False | False | False | False | False | False | False | False | False | False | False | False |
---|
4 | False | False | False | False | False | False | False | False | False | False | True | False |
---|
df.to_csv('shltask1_1.csv')
复习:数据分析的第一步,加载数据我们已经学习完毕了。当数据展现在我们面前的时候,我们所要做的第一步就是认识他,今天我们要学习的就是了解字段含义以及初步观察数据。
1.4 知道你的数据叫什么
我们学习pandas的基础操作,那么上一节通过pandas加载之后的数据,其数据类型是什么呢?
开始前导入numpy和pandas
import numpy as np
import pandas as pd
1.4.1 任务一:pandas中有两个数据类型DateFrame和Series,通过查找简单了解他们。然后自己写一个关于这两个数据类型的小例子🌰[开放题]
s=pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
a = pd.Series([1,2,5,4,3], index=['a', 'b', 'c', 'd', 'e'])
s,a
(a -0.215950
b 0.356634
c -1.022319
d 0.730578
e 0.469027
dtype: float64,
a 1
b 2
c 5
d 4
e 3
dtype: int64)
d= {'b': 1, 'a': 0, 'c': 2}
s=pd.Series(d)
s
b 1
a 0
c 2
dtype: int64
d = {'one': pd.Series([1., 2., 3.], index=['a', 'b', 'c']),
'two': pd.Series([1., 2., 3., 4.], index=['a', 'b', 'c', 'd'])}
df = pd.DataFrame(d)
df
| one | two |
---|
a | 1.0 | 1.0 |
---|
b | 2.0 | 2.0 |
---|
c | 3.0 | 3.0 |
---|
d | NaN | 4.0 |
---|
'''
#我们举的例子
sdata = {'Ohio': 35000, 'Texas': 71000, 'Oregon': 16000, 'Utah': 5000}
example_1 = pd.Series(sdata)
example_1
'''
'''
#我们举的例子
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
'year': [2000, 2001, 2002, 2001, 2002, 2003],'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
example_2 = pd.DataFrame(data)
example_2
'''
1.4.2 任务二:根据上节课的方法载入"train.csv"文件
df = pd.read_csv('train.csv')
df.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
也可以加载上一节课保存的"train_chinese.csv"文件。通过翻译版train_chinese.csv熟悉了这个数据集,然后我们对trian.csv来进行操作
1.4.3 任务三:查看DataFrame数据的每列的名称
df.columns
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
dtype='object')
1.4.4任务四:查看"Cabin"这列的所有值[有多种方法]
df.Cabin
0 NaN
1 C85
2 NaN
3 C123
4 NaN
...
886 NaN
887 B42
888 NaN
889 C148
890 NaN
Name: Cabin, Length: 891, dtype: object
df['Cabin']
0 NaN
1 C85
2 NaN
3 C123
4 NaN
...
886 NaN
887 B42
888 NaN
889 C148
890 NaN
Name: Cabin, Length: 891, dtype: object
type(df.Cabin)
pandas.core.series.Series
df[['Cabin']]
| Cabin |
---|
0 | NaN |
---|
1 | C85 |
---|
2 | NaN |
---|
3 | C123 |
---|
4 | NaN |
---|
... | ... |
---|
886 | NaN |
---|
887 | B42 |
---|
888 | NaN |
---|
889 | C148 |
---|
890 | NaN |
---|
891 rows × 1 columns
1.4.5 任务五:加载文件"test_1.csv",然后对比"train.csv",看看有哪些多出的列,然后将多出的列删除
经过我们的观察发现一个测试集test_1.csv有一列是多余的,我们需要将这个多余的列删去
test1=pd.read_csv('test_1.csv')
test1
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | a |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | 100 |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | 100 |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | 100 |
---|
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | 100 |
---|
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | 100 |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
886 | 886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S | 100 |
---|
887 | 887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | 100 |
---|
888 | 888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | 100 |
---|
889 | 889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | 100 |
---|
890 | 890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q | 100 |
---|
891 rows × 14 columns
df.head()
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
test1.head()
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
【思考】还有其他的删除多余的列的方式吗?
1.4.6 任务六: 将[‘PassengerId’,‘Name’,‘Age’,‘Ticket’]这几个列元素隐藏,只观察其他几个列元素
test1.drop(['PassengerId','Name','Age','Ticket'],axis=1).head(3)
| Unnamed: 0 | Survived | Pclass | Sex | SibSp | Parch | Fare | Cabin | Embarked |
---|
0 | 0 | 0 | 3 | male | 1 | 0 | 7.2500 | NaN | S |
---|
1 | 1 | 1 | 1 | female | 1 | 0 | 71.2833 | C85 | C |
---|
2 | 2 | 1 | 3 | female | 0 | 0 | 7.9250 | NaN | S |
---|
【思考】对比任务五和任务六,是不是使用了不一样的方法(函数),如果使用一样的函数如何完成上面的不同的要求呢?
【思考回答】
如果想要完全的删除你的数据结构,使用inplace=True,因为使用inplace就将原数据覆盖了,所以这里没有用
1.5 筛选的逻辑
表格数据中,最重要的一个功能就是要具有可筛选的能力,选出我所需要的信息,丢弃无用的信息。
下面我们还是用实战来学习pandas这个功能。
1.5.1 任务一: 我们以"Age"为筛选条件,显示年龄在10岁以下的乘客信息。
test1[test1['Age']< 10]
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
7 | 7 | 8 | 0 | 3 | Palsson, Master. Gosta Leonard | male | 2.00 | 3 | 1 | 349909 | 21.0750 | NaN | S |
---|
10 | 10 | 11 | 1 | 3 | Sandstrom, Miss. Marguerite Rut | female | 4.00 | 1 | 1 | PP 9549 | 16.7000 | G6 | S |
---|
16 | 16 | 17 | 0 | 3 | Rice, Master. Eugene | male | 2.00 | 4 | 1 | 382652 | 29.1250 | NaN | Q |
---|
24 | 24 | 25 | 0 | 3 | Palsson, Miss. Torborg Danira | female | 8.00 | 3 | 1 | 349909 | 21.0750 | NaN | S |
---|
43 | 43 | 44 | 1 | 2 | Laroche, Miss. Simonne Marie Anne Andree | female | 3.00 | 1 | 2 | SC/Paris 2123 | 41.5792 | NaN | C |
---|
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
---|
827 | 827 | 828 | 1 | 2 | Mallet, Master. Andre | male | 1.00 | 0 | 2 | S.C./PARIS 2079 | 37.0042 | NaN | C |
---|
831 | 831 | 832 | 1 | 2 | Richards, Master. George Sibley | male | 0.83 | 1 | 1 | 29106 | 18.7500 | NaN | S |
---|
850 | 850 | 851 | 0 | 3 | Andersson, Master. Sigvard Harald Elias | male | 4.00 | 4 | 2 | 347082 | 31.2750 | NaN | S |
---|
852 | 852 | 853 | 0 | 3 | Boulos, Miss. Nourelain | female | 9.00 | 1 | 1 | 2678 | 15.2458 | NaN | C |
---|
869 | 869 | 870 | 1 | 3 | Johnson, Master. Harold Theodor | male | 4.00 | 1 | 1 | 347742 | 11.1333 | NaN | S |
---|
62 rows × 13 columns
1.5.2 任务二: 以"Age"为条件,将年龄在10岁以上和50岁以下的乘客信息显示出来,并将这个数据命名为midage
midage = test1[(test1["Age"]>10)& (test1["Age"]<50)]
midage.head(3)
| Unnamed: 0 | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
---|
0 | 0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
midage1 = test1[(test1["Age"]>10)| (test1["Age"]<50)]
midage1.shape
(714, 13)
midage.shape
(576, 13)
【提示】了解pandas的条件筛选方式以及如何使用交集和并集操作
1.5.3 任务三:将midage的数据中第100行的"Pclass"和"Sex"的数据显示出来
midage1 = midage.reset_index(drop=True)
midage1.loc[[100],['Pclass','Name','Sex']]
| Pclass | Name | Sex |
---|
100 | 2 | Byles, Rev. Thomas Roussel Davids | male |
---|
midage.to_csv('midage.csv')
【提示】在抽取数据中,我们希望数据的相对顺序保持不变,用什么函数可以达到这个效果呢?
1.5.4 任务四:使用loc方法将midage的数据中第100,105,108行的"Pclass","Name"和"Sex"的数据显示出来
midage.loc[[100,105,108],['Pclass','Name','Sex']]
| Pclass | Name | Sex |
---|
100 | 3 | Petranec, Miss. Matilda | female |
---|
105 | 3 | Mionoff, Mr. Stoytcho | male |
---|
108 | 3 | Rekic, Mr. Tido | male |
---|
1.5.5 任务五:使用iloc方法将midage的数据中第100,105,108行的"Pclass","Name"和"Sex"的数据显示出来
midage.iloc[[100,105,108],[2,3,4]]
| Survived | Pclass | Name |
---|
149 | 0 | 2 | Byles, Rev. Thomas Roussel Davids |
---|
160 | 0 | 3 | Cribb, Mr. John Hatfield |
---|
163 | 0 | 3 | Calic, Mr. Jovo |
---|
【思考】对比iloc 和loc 的异同 复习:在前面我们已经学习了Pandas基础,知道利用Pandas读取csv数据的增删查改,今天我们要学习的就是探索性数据分析,主要介绍如何利用Pandas进行排序、算术计算以及计算描述函数describe()的使用。
1 第一章:探索性数据分析
开始之前,导入numpy、pandas包和数据
import numpy as np
import pandas as pd
text= pd.read_csv('train_chinese.csv')
text.head()
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
---|
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
---|
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
---|
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
---|
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
---|
1.6 了解你的数据吗?
教材《Python for Data Analysis》第五章
1.6.1 任务一:利用Pandas对示例数据进行排序,要求升序
'''
我们举了一个例子
pd.DataFrame() :创建一个DataFrame对象
np.arange(8).reshape((2, 4)) : 生成一个二维数组(2*4),第一列:0,1,2,3 第二列:4,5,6,7
index=[2,1] :DataFrame 对象的索引列
columns=['d', 'a', 'b', 'c'] :DataFrame 对象的索引行
'''
frame=pd.DataFrame(np.arange(8).reshape(2,4),
index=['2', '1'],
columns=['d', 'a', 'b', 'c'])
frame
【代码解析】
pd.DataFrame() :创建一个DataFrame对象
np.arange(8).reshape((2, 4)) : 生成一个二维数组(2*4),第一列:0,1,2,3 第二列:4,5,6,7
index=['2, 1] :DataFrame 对象的索引列
columns=[‘d’, ‘a’, ‘b’, ‘c’] :DataFrame 对象的索引行
【问题】:大多数时候我们都是想根据列的值来排序,所以将你构建的DataFrame中的数据根据某一列,升序排列
frame.sort_values('b')
【思考】通过书本你能说出Pandas对DataFrame数据的其他排序方式吗?
【总结】下面将不同的排序方式做一个总结
1.让行索引升序排序
frame.sort_index()
2.让列索引升序排序
frame.sort_index(axis=1)
3.让列索引降序排序
frame.sort_index(axis=1, ascending=False)
4.让任选两列数据同时降序排序
frame.sort_values(by=['a', 'c'], ascending=False)
1.6.2 任务二:对泰坦尼克号数据(trian.csv)按票价和年龄两列进行综合排序(降序排列),从这个数据中你可以分析出什么?
'''
在开始我们已经导入了train_chinese.csv数据,而且前面我们也学习了导入数据过程,根据上面学习,我们直接对目标列进行排序即可
head(20) : 读取前20条数据
'''
text.sort_values(by=['票价', '年龄'], ascending=False).head()
| 乘客ID | 是否幸存 | 仓位等级 | 姓名 | 性别 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 船票信息 | 票价 | 客舱 | 登船港口 |
---|
679 | 680 | 1 | 1 | Cardeza, Mr. Thomas Drake Martinez | male | 36.0 | 0 | 1 | PC 17755 | 512.3292 | B51 B53 B55 | C |
---|
258 | 259 | 1 | 1 | Ward, Miss. Anna | female | 35.0 | 0 | 0 | PC 17755 | 512.3292 | NaN | C |
---|
737 | 738 | 1 | 1 | Lesurer, Mr. Gustave J | male | 35.0 | 0 | 0 | PC 17755 | 512.3292 | B101 | C |
---|
438 | 439 | 0 | 1 | Fortune, Mr. Mark | male | 64.0 | 1 | 4 | 19950 | 263.0000 | C23 C25 C27 | S |
---|
341 | 342 | 1 | 1 | Fortune, Miss. Alice Elizabeth | female | 24.0 | 3 | 2 | 19950 | 263.0000 | C23 C25 C27 | S |
---|
【思考】排序后,如果我们仅仅关注年龄和票价两列。根据常识我知道发现票价越高的应该客舱越好,所以我们会明显看出,票价前20的乘客中存活的有14人,这是相当高的一个比例,那么我们后面是不是可以进一步分析一下票价和存活之间的关系,年龄和存活之间的关系呢?当你开始发现数据之间的关系了,数据分析就开始了。
当然,这只是我的想法,你还可以有更多想法,欢迎写在你的学习笔记中。
多做几个数据的排序
1.6.3 任务三:利用Pandas进行算术计算,计算两个DataFrame数据相加结果
"""
我们举了一个例子:
frame1_a = pd.DataFrame(np.arange(9.).reshape(3, 3),
columns=['a', 'b', 'c'],
index=['one', 'two', 'three'])
frame1_b = pd.DataFrame(np.arange(12.).reshape(4, 3),
columns=['a', 'e', 'c'],
index=['first', 'one', 'two', 'second'])
frame1_a
"""
frame1_a = pd.DataFrame(np.arange(9.).reshape(3, 3),
columns=['a', 'b', 'c'],
index=['one', 'two', 'three'])
frame1_b = pd.DataFrame(np.arange(12.).reshape(4, 3),
columns=['a', 'e', 'c'],
index=['first', 'one', 'two', 'second'])
frame1_a,frame1_b
( a b c
one 0.0 1.0 2.0
two 3.0 4.0 5.0
three 6.0 7.0 8.0,
a e c
first 0.0 1.0 2.0
one 3.0 4.0 5.0
two 6.0 7.0 8.0
second 9.0 10.0 11.0)
将frame_a和frame_b进行相加
frame1_a + frame1_b
| a | b | c | e |
---|
first | NaN | NaN | NaN | NaN |
---|
one | 3.0 | NaN | 7.0 | NaN |
---|
second | NaN | NaN | NaN | NaN |
---|
three | NaN | NaN | NaN | NaN |
---|
two | 9.0 | NaN | 13.0 | NaN |
---|
【提醒】两个DataFrame相加后,会返回一个新的DataFrame,对应的行和列的值会相加,没有对应的会变成空值NaN。
当然,DataFrame还有很多算术运算,如减法,除法等,有兴趣的同学可以看《利用Python进行数据分析》第五章 算术运算与数据对齐 部分,多在网络上查找相关学习资料。
1.6.4 任务四:通过泰坦尼克号数据如何计算出在船上最大的家族有多少人?
'''
还是用之前导入的chinese_train.csv如果我们想看看在船上,最大的家族有多少人(‘兄弟姐妹个数’+‘父母子女个数’),我们该怎么做呢?
'''
text['兄弟姐妹个数'] + text['父母子女个数']
max(text['兄弟姐妹个数'] + text['父母子女个数'])
10
【提醒】我们只需找出”兄弟姐妹个数“和”父母子女个数“之和最大的数,当然你还可以想出很多方法和思考角度,欢迎你来说出你的看法。
多做几个数据的相加,看看你能分析出什么?
1.6.5 任务五:学会使用Pandas describe()函数查看数据基本统计信息
"""
我们举了一个例子:
frame2 = pd.DataFrame([[1.4, np.nan],
[7.1, -4.5],
[np.nan, np.nan],
[0.75, -1.3]
], index=['a', 'b', 'c', 'd'], columns=['one', 'two'])
frame2
"""
调用 describe 函数,观察frame2的数据基本信息
text.describe()
| 乘客ID | 是否幸存 | 仓位等级 | 年龄 | 兄弟姐妹个数 | 父母子女个数 | 票价 |
---|
count | 891.000000 | 891.000000 | 891.000000 | 714.000000 | 891.000000 | 891.000000 | 891.000000 |
---|
mean | 446.000000 | 0.383838 | 2.308642 | 29.699118 | 0.523008 | 0.381594 | 32.204208 |
---|
std | 257.353842 | 0.486592 | 0.836071 | 14.526497 | 1.102743 | 0.806057 | 49.693429 |
---|
min | 1.000000 | 0.000000 | 1.000000 | 0.420000 | 0.000000 | 0.000000 | 0.000000 |
---|
25% | 223.500000 | 0.000000 | 2.000000 | 20.125000 | 0.000000 | 0.000000 | 7.910400 |
---|
50% | 446.000000 | 0.000000 | 3.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 |
---|
75% | 668.500000 | 1.000000 | 3.000000 | 38.000000 | 1.000000 | 0.000000 | 31.000000 |
---|
max | 891.000000 | 1.000000 | 3.000000 | 80.000000 | 8.000000 | 6.000000 | 512.329200 |
---|
from matplotlib import pyplot as plt
plt.hist(text['年龄'])
(array([ 54., 46., 177., 169., 118., 70., 45., 24., 9., 2.]),
array([ 0.42 , 8.378, 16.336, 24.294, 32.252, 40.21 , 48.168, 56.126,
64.084, 72.042, 80. ]),
<BarContainer object of 10 artists>)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-zBi4cxFg-1639494543264)(output_46_1.png)]
1.6.6 任务六:分别看看泰坦尼克号数据集中 票价、父母子女 这列数据的基本统计数据,你能发现什么?
'''
看看泰坦尼克号数据集中 票价 这列数据的基本统计数据
'''
text['票价'].describe()
count 891.000000
mean 32.204208
std 49.693429
min 0.000000
25% 7.910400
50% 14.454200
75% 31.000000
max 512.329200
Name: 票价, dtype: float64
plt.hist(text['票价'])
(array([732., 106., 31., 2., 11., 6., 0., 0., 0., 3.]),
array([ 0. , 51.23292, 102.46584, 153.69876, 204.93168, 256.1646 ,
307.39752, 358.63044, 409.86336, 461.09628, 512.3292 ]),
<BarContainer object of 10 artists>)
[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-UC0LA5Jw-1639494543269)(output_50_1.png)]
【思考】从上面数据我们可以看出,试试在下面写出你的看法。然后看看我们给出的答案。
当然,答案只是我的想法,你还可以有更多想法,欢迎写在你的学习笔记中。
text['父母子女个数'].describe()
count 891.000000
mean 0.381594
std 0.806057
min 0.000000
25% 0.000000
50% 0.000000
75% 0.000000
max 6.000000
Name: 父母子女个数, dtype: float64
多做几个组数据的统计,看看你能分析出什么?
【思考】有更多想法,欢迎写在你的学习笔记中。
【总结】本节中我们通过Pandas的一些内置函数对数据进行了初步统计查看,这个过程最重要的不是大家得掌握这些函数,而是看懂从这些函数出来的数据,构建自己的数据分析思维,这也是第一章最重要的点,希望大家学完第一章能对数据有个基本认识,了解自己在做什么,为什么这么做,后面的章节我们将开始对数据进行清洗,进一步分析。
|