# 1-1 Getting started with pandas
import numpy as np
import pandas as pd

# Create a Series from a list. The recorded output shows the values stored as
# float64, with the missing entry printed as NaN.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)
"""
0 1.0
1 3.0
2 6.0
3 NaN
4 44.0
5 1.0
dtype: float64
"""

# Create a DatetimeIndex of six consecutive days starting at 2016-01-01.
date = pd.date_range('2016-01-01', periods=6)
print(date)
"""
DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
'2016-01-05', '2016-01-06'],
dtype='datetime64[ns]', freq='D')
"""

# Create a DataFrame with the dates as row index and string column labels.
# The cells are random, so the numbers below are only one example run.
df = pd.DataFrame(np.random.rand(6, 4), index=date, columns=['a', 'b', 'c', 'd'])
print(df)
"""
a b c d
2016-01-01 0.113951 0.583000 0.167336 0.917897
2016-01-02 0.632843 0.950597 0.280311 0.946806
2016-01-03 0.367501 0.313236 0.475095 0.889570
2016-01-04 0.653676 0.444720 0.091550 0.272699
2016-01-05 0.448919 0.328602 0.644945 0.196358
2016-01-06 0.656723 0.355628 0.886951 0.688788
"""

# A DataFrame built from a plain matrix, without explicit index or column labels.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

# A DataFrame built from a dict: one entry per column. Scalars ('A', 'B', 'F')
# are repeated for every row, as the recorded output shows.
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Timestamp('20120202'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df2)
"""
A B C D E F
0 1 2012-02-02 1.0 3 test foo
1 1 2012-02-02 1.0 3 train foo
2 1 2012-02-02 1.0 3 test foo
3 1 2012-02-02 1.0 3 train foo
"""

# Per-column dtypes.
print(df2.dtypes)
"""
A int64
B datetime64[ns]
C float32
D int32
E category
F object
dtype: object
"""

# Column labels.
print(df2.columns)
"Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')"

# The underlying values as a NumPy array.
print(df2.values)
"""
[[1 Timestamp('2012-02-02 00:00:00') 1.0 3 'test' 'foo']
[1 Timestamp('2012-02-02 00:00:00') 1.0 3 'train' 'foo']
[1 Timestamp('2012-02-02 00:00:00') 1.0 3 'test' 'foo']
[1 Timestamp('2012-02-02 00:00:00') 1.0 3 'train' 'foo']]
"""

# Summary statistics; the recorded output only covers the numeric columns.
print(df2.describe())
"""
A C D
count 4.0 4.0 4.0
mean 1.0 1.0 3.0
std 0.0 0.0 0.0
min 1.0 1.0 3.0
25% 1.0 1.0 3.0
50% 1.0 1.0 3.0
75% 1.0 1.0 3.0
max 1.0 1.0 3.0
"""

# Transpose (rows become columns).
print(df2.T)

# Sort by column labels (axis=1) in descending order.
print(df2.sort_index(axis=1, ascending=False))
"""
F E D C B A
0 foo test 3 1.0 2012-02-02 1
1 foo train 3 1.0 2012-02-02 1
2 foo test 3 1.0 2012-02-02 1
3 foo train 3 1.0 2012-02-02 1
"""

# Sort rows by the values of column 'E'.
print(df2.sort_values(by='E'))
"""
A B C D E F
0 1 2012-02-02 1.0 3 test foo
2 1 2012-02-02 1.0 3 test foo
1 1 2012-02-02 1.0 3 train foo
3 1 2012-02-02 1.0 3 train foo
"""
# 1-2 Data processing: selecting rows, columns and cells
date = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=date, columns=['A', 'B', 'C', 'D'])
print(df)

# Select one column, by key and by attribute access.
print(df['A'])
print(df.A)

# Slice rows by position, then by date label.
print(df[0:3])
print(df['2013-01-04':'2013-01-06'])

# Select one row by label.
# (The recorded dtype is int32; NumPy's default integer width is platform
# dependent, so other machines may print int64.)
print(df.loc['2013-01-06'])
"""
A 20
B 21
C 22
D 23
Name: 2013-01-06 00:00:00, dtype: int32
"""

# All rows, columns 'A' and 'B' only.
print(df.loc[:, ['A', 'B']])
"""
A B
2013-01-01 0 1
2013-01-02 4 5
2013-01-03 8 9
2013-01-04 12 13
2013-01-05 16 17
2013-01-06 20 21
"""

# One row by label, restricted to columns 'A' and 'B'.
print(df.loc['2013-01-03', ['A', 'B']])
"""
A 8
B 9
Name: 2013-01-03 00:00:00, dtype: int32
"""

# Purely positional selection: rows 1, 3, 5 and columns 1..2.
print(df.iloc[[1, 3, 5], 1:3])
"""
B C
2013-01-02 5 6
2013-01-04 13 14
2013-01-06 21 22
"""

print(df)
# Element-wise comparison yields a boolean frame; a boolean Series built from
# one column filters rows. The string below records both outputs in order.
print(df < 8)
print(df[df.A < 8])
"""
A B C D
2013-01-01 True True True True
2013-01-02 True True True True
2013-01-03 False False False False
2013-01-04 False False False False
2013-01-05 False False False False
2013-01-06 False False False False
A B C D
2013-01-01 0 1 2 3
2013-01-02 4 5 6 7
"""
# 1-3 Setting values
date = pd.date_range('20100101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=date, columns=['A', 'B', 'C', 'D'])
print(df)

# Overwrite a single cell by integer position (third row, third column).
df.iloc[2, 2] = 111
print(df)

# Overwrite a single cell by label (row 2010-01-03, column 'B').
df.loc['20100103', 'B'] = 999
print(df)

# NOTE(review): the next two recorded tables had no producing statement in
# these notes. They match boolean-mask assignments, reconstructed here from the
# recorded output -- confirm against the original tutorial. They run on copies
# so that `df` and the outputs further down are unaffected.
rows_zeroed = df.copy()
rows_zeroed[rows_zeroed.A > 0] = 0  # zero every row whose 'A' value is positive
print(rows_zeroed)
"""
A B C D
2010-01-01 0 1 2 3
2010-01-02 0 0 0 0
2010-01-03 0 0 0 0
2010-01-04 0 0 0 0
2010-01-05 0 0 0 0
2010-01-06 0 0 0 0
"""
col_zeroed = df.copy()
col_zeroed.loc[col_zeroed.A > 0, 'A'] = 0  # zero only column 'A' on those rows
print(col_zeroed)
"""
A B C D
2010-01-01 0 1 2 3
2010-01-02 0 5 6 7
2010-01-03 0 999 111 11
2010-01-04 0 13 14 15
2010-01-05 0 17 18 19
2010-01-06 0 21 22 23
"""

# Add a new column filled with NaN.
df['F'] = np.nan
print(df)
"""
A B C D F
2010-01-01 0 1 2 3 NaN
2010-01-02 4 5 6 7 NaN
2010-01-03 8 999 111 11 NaN
2010-01-04 12 13 14 15 NaN
2010-01-05 16 17 18 19 NaN
2010-01-06 20 21 22 23 NaN
"""

# Add a new column from a Series whose index matches the frame's date index.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20100101', periods=6))
print(df)
"""
A B C D F E
2010-01-01 0 1 2 3 NaN 1
2010-01-02 4 5 6 7 NaN 2
2010-01-03 8 999 111 11 NaN 3
2010-01-04 12 13 14 15 NaN 4
2010-01-05 16 17 18 19 NaN 5
2010-01-06 20 21 22 23 NaN 6
"""
# 1-4 Handling missing data
import numpy as np
import pandas as pd

date = pd.date_range('20100101', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=date, columns=['A', 'B', 'C', 'D'])
print(df)

# NaN cannot be stored in an integer column. Cast the two affected columns to
# float explicitly instead of relying on an implicit upcast during assignment,
# which pandas >= 2.1 deprecates. The printed frames are the same either way.
df = df.astype({'B': float, 'C': float})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# Drop every row (axis=0) or every column (axis=1) that contains any NaN.
# The string below records both outputs in order.
print(df.dropna(axis=0, how='any'))
print(df.dropna(axis=1, how='any'))
"""
A B C D
2010-01-03 8 9.0 10.0 11
2010-01-04 12 13.0 14.0 15
2010-01-05 16 17.0 18.0 19
2010-01-06 20 21.0 22.0 23
A D
2010-01-01 0 3
2010-01-02 4 7
2010-01-03 8 11
2010-01-04 12 15
2010-01-05 16 19
2010-01-06 20 23
"""

# Replace missing values with 0.
print(df.fillna(value=0))
"""
A B C D
2010-01-01 0 0.0 2.0 3
2010-01-02 4 5.0 0.0 7
2010-01-03 8 9.0 10.0 11
2010-01-04 12 13.0 14.0 15
2010-01-05 16 17.0 18.0 19
2010-01-06 20 21.0 22.0 23
"""

# Boolean mask marking the missing cells.
print(df.isnull())
"""
A B C D
2010-01-01 False True False False
2010-01-02 False False True False
2010-01-03 False False False False
2010-01-04 False False False False
2010-01-05 False False False False
2010-01-06 False False False False
"""

# Single yes/no answer: is any cell missing? Reduce over the raw boolean array
# so the result is one scalar.
print(df.isnull().values.any())
# 1-5 Importing and exporting data
# Read a CSV file into a DataFrame, print it, then save it in pickle format.
# NOTE: both paths are absolute and machine-specific; adjust them before running.
data = pd.read_csv('C:/Users/liyuelong/Desktop/student.csv')
print(data)
data.to_pickle('C:/Users/liyuelong/Desktop/student.pickle')
# 1-6 Merging data with concat: preparing the source frames
import numpy as np
import pandas as pd

# Three 3x4 frames with identical columns, filled with 0, 1 and 2 respectively.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])
print(df1)
print(df2)
print(df3)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
a b c d
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
a b c d
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
"""

# Stack the frames vertically; ignore_index=True renumbers the rows 0..8.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
"""

# Without ignore_index every frame keeps its own row labels (0, 1, 2 repeated).
res = pd.concat([df1, df2, df3], axis=0)
print(res)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
0 1.0 1.0 1.0 1.0
1 1.0 1.0 1.0 1.0
2 1.0 1.0 1.0 1.0
0 2.0 2.0 2.0 2.0
1 2.0 2.0 2.0 2.0
2 2.0 2.0 2.0 2.0
"""
# Two frames whose columns and row labels only partly overlap.
dfj1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
dfj2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(dfj1)
print(dfj2)

# join='inner': keep only the columns present in both frames.
res1 = pd.concat([dfj1, dfj2], join='inner', ignore_index=True)
print(res1)
"""
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
3 1.0 1.0 1.0
4 1.0 1.0 1.0
5 1.0 1.0 1.0
"""

# join='outer': keep the union of the columns; missing cells become NaN.
res1 = pd.concat([dfj1, dfj2], join='outer', ignore_index=True)
print(res1)
"""
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
3 NaN 1.0 1.0 1.0 1.0
4 NaN 1.0 1.0 1.0 1.0
5 NaN 1.0 1.0 1.0 1.0
"""

# axis=1: place the frames side by side, aligned on the row labels.
res1 = pd.concat([dfj1, dfj2], axis=1)
print(res1)
"""
a b c d b c d e
1 0.0 0.0 0.0 0.0 NaN NaN NaN NaN
2 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
3 0.0 0.0 0.0 0.0 1.0 1.0 1.0 1.0
4 NaN NaN NaN NaN 1.0 1.0 1.0 1.0
"""
# Frames for the row-appending examples; dfj5 has a different column set and
# different row labels from the other two.
dfj3 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
dfj4 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
dfj5 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
print(dfj3)
print(dfj4)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Append one frame below another and renumber the rows.
res2 = pd.concat([dfj3, dfj4], ignore_index=True)
print(res2)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
"""

# Append several frames at once, keeping their original row labels.
res3 = pd.concat([dfj3, dfj4, dfj5])
print(res3)

# Append a Series as one extra row: turn it into a one-row frame first
# (to_frame().T) so its index labels line up with the column names.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
res4 = pd.concat([dfj3, s1.to_frame().T], ignore_index=True)
print(res4)
"""
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 2.0 3.0 4.0
"""