import pandas as pd
pandas总结
载入CSV文件,并展示
df = pd.read_csv('1ttdata.csv')
df
展示元数据
df.info()
type(df)
df.index
支持的读取方式
df.head()
df.tail()
df.values[0]
df.keys()
df['XXXkey']
df['Age'].head(10)[:5]
dg = df.set_index('Name')
#通过一个volume或者index
dg = df.set_index('Name')
test1 = dg['age']
test1['Braund, Mr. Owen Harris']
test1.mean()
test1.min()
test1.max()
自定义DataFrame
data = {'country':['aaa','bbb','ccc'],
'population':[10,12,14]}
df_data = pd.DataFrame(data)
df_data
df_date.describe()
显示图表的5行,并展示2到3列
df.iloc[:5,1:3]
设置Name列为索引
df = df.set_index('Name')
df.loc['Cumings, Mrs. John Bradley (Florence Briggs Thayer)']
df.loc['Cumings, Mrs. John Bradley (Florence Briggs Thayer)','Age']
根据索引拿出部分
df.loc['Cumings, Mrs. John Bradley (Florence Briggs Thayer)':'Allen, Mr. William Henry']
df.loc['Heikkinen, Miss. Laina','Age'] = 50
df['Age'] > 30
df.loc[df['Sex'] == 'male' ,'Age'].mean()
df.iloc[1]
t1.loc[1,'Age'] = 50
t1 = df.copy()
打印出名称和年龄
df = pd.DataFrame({'name':['A','B','C','A','A','B'],
'age':[8,7,9,6,5,4]})
for name in ['A','B','C']:
print(name,df[df['name'] == name])
根据性别分类分别去求年龄的平均值
df.groupby('Sex')['Age'].mean()
自定义数组
df = pd.DataFrame([[1,2,3],[4,5,6]],index = ['a','b'],columns = ['A','B','C'])
df.sum()
df.sum( axis = 1 )
df.sum(axis = 'columns')
df['Age'].value_counts()
df.cov()
df.corr()
df['Age'].value_counts()
df['Age'].value_counts(ascending = True)
df['Age'].value_counts(ascending = True, bins=5)
df['Age'].count()
s1 = df.append(s2,ignore_index=True)
del s1['k']
s1.drop(891)
s1.drop(2,inplace = True)
df.iloc[0]
df.loc['a']
改操作
df.loc['a']['A'] = 150
进行拼接
df3 = pd.concat([s1,s2],axis=1)
增加列,也可以,axis=1 去拼接
df['test'] = [xxx,xxx,xxx,...]
del df['Age']
df3.drop('h',axis=0,inplace = True)
df3.drop(['Age','Cabin'],axis=1)
构造一个多维表
left = pd.DataFrame({'key':['KO','K1','K3'],
'A':['A1','A2','A3'],
'B':['B1','B2','B3']})
right = pd.DataFrame({'key':['KO','K1','K3'],
'C':['C1','C2','C3'],
'D':['D1','D2','D3']})
pd.merge(left,right)
pd.merge(left,right,on='key')
'''
key A B key2_x C D key2_y
0 KO A1 B1 K0 C1 D1 K0
1 K1 A2 B2 K2 C2 D2 K2
2 K3 A3 B3 K3 C3 D3 K3
'''
pd.merge(left,right,on=['key','key2'])
特殊情况
left = pd.DataFrame({'key':['KO','K1','K3'],
'A':['A1','A2','A3'],
'B':['B1','B2','B3'],
'key2':['K0','K2','K3']})
right = pd.DataFrame({'key':['KO','K1','K3'],
'C':['C1','C2','C3'],
'D':['D1','D2','D3'],
'key2':['K0','K2','K4']})
pd.merge(left,right,on=['key','key2'])
pd.merge(left,right,on=['key','key2'],how='outer')
'''
key A B key2 C D
0 KO A1 B1 K0 C1 D1
1 K1 A2 B2 K2 C2 D2
2 K3 A3 B3 K3 NaN NaN
3 K3 NaN NaN K4 C3 D3
'''
pd.merge(left,right,on=['key','key2'],how='outer',indicator = True)
pd.set_option('display.max_rows',100)
pd.Series(index = range(0,100) )
pd.set_option('display.max_columns',30)
a = pd.DataFrame(columns = range(0,40))
pd.get_option('display.max_colwidth')
pd.Series(index = ['A'],data=['t'* 60])
pd.get_option('display.precision')
pd.set_option('display.precision',10)
数据透视表
a.pivot(index = 'A',columns = 'C',values = 'D')
生成表格
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare')
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='mean')
df.pivot_table(index = 'Sex',columns='Pclass',values='Fare',aggfunc='count')
pd.crosstab(index = df['Sex'],columns = df['Pclass'])
df['Underaged'] = df['Age'] <= 18
df.pivot_table(index = 'Underaged',columns='Sex',values='Survived',aggfunc='sum')
df.pivot_table(index = 'Underaged',columns='Pclass',values='Fare',aggfunc='count')
df[df['Underaged']]
a = df[df['Underaged']]
a[a['Survived'] == 1]
pandas时间和datetime时间的区别
a = datetime.datetime(year=2020,month=12,day=24,hour=10,minute=30)
b = pd.Timestamp('2020-12-23-10:30')
a.day
b.day
b + pd.Timedelta('50 0000 0000')
b + pd.Timedelta('5 minute')
s = pd.Series(['2017-11-24 00:00:00','2017-11-25 00:00:00','2017-11-26 00:00:00'])
ts = pd.to_datetime(s)
'''
0 2017-11-24
1 2017-11-25
2 2017-11-26
dtype: datetime64[ns]
'''
ts.dt.day
a = list(ts.dt.day)
pandas时间的对时间表格的使用
pd.Series(pd.date_range(start='2022-5-12',periods = 10,freq = '24H'))
data = pd.read_csv('flowdata.csv')
data['Time'] = pd.to_datetime(data['Time'])
data = pd.read_csv('flowdata.csv',index_col = 0 ,parse_dates = True)
data[pd.Timestamp('2012-01-01 09:00'):pd.Timestamp('2012-01-01 19:00')]
data['2012-01':'2012-02']
data[data.index.month == 1]
data[(data.index.day == 1) & (data.index.month == 1)]
data.resample('D').mean()
data.resample('3D').max()
%matplotlib notebook
data.resample('M').mean().plot()
condas常用 的命令
进行排序
data.sort_values(by=['group','data'],ascending = [False,True])
data.sort_values(by='data')
data.drop_duplicates().count()
data.drop_duplicates(subset='group')
def food_map(series):
if series['group'] == 'a':
return 'A'
elif series['group'] == 'b':
return 'B'
data['food_map'] = data.apply(food_map,axis = 'columns')
data
data.apply(print,axis = 'columns')
对数据快速转换
Upper_dict = {
'通州':'河北',}
'廊坊':'河北',
'朝阳':'北京'
}
data['new_flat'] = data['flat'].map(Upper_dict)
df.assign(ration = df['data1']/df['data2'])
df.drop('ration',axis='columns',inplace=True)
ages = [15,18,20,21,22,34,41,52,53,63,79]
bins = [10,20,30,40,50,90]
bins_res = pd.cut(ages,bins)
bins_res
'''
outs:
[(10, 20], (10, 20], (10, 20], (20, 30], (20, 30], ..., (40, 50], (50, 90], (50, 90], (50, 90], (50, 90]]
Length: 11
Categories (5, interval[int64]): [(10, 20] < (20, 30] < (30, 40] < (40, 50] < (50, 90]]
'''
bins_res.codes
'''
outs:
array([0, 0, 0, 1, 1, 2, 3, 4, 4, 4, 4], dtype=int8)
'''
pd.cut(ages,[10,30,50,80]).codes
'''
array([0, 0, 0, 0, 0, 1, 1, 2, 2, 2, 2], dtype=int8)
'''
group_names = ['youth','old']
a = pd.cut(ages,[10,20,80],labels=group_names)
pd.value_counts(a)
'''
old 8
youth 3
dtype: int64
'''
condas 缺失值的 应用
df = pd.DataFrame([range(3),[0, np.nan ,0], [0,0,np.nan],range(3) ])
df.isnull()
df.isnull().any()
df.isnull().any(axis = 1)
df[df.isnull().any(axis=1)]
0 1 2 0 0 1.0 2.0 1 0 NaN 0.0 2 0 0.0 NaN 3 0 1.0 2.0
groupby
df = pd.DataFrame({'A':['foo','foo','foo','bar',
'foo','bar','foo','bar'],
'B':['one','two','one','two',
'one','two','one','one'],
'C':np.random.randn(8),
'D':np.random.randn(8)})
df.groupby(['A','B']).count()
s = pd.Series([1,2,3,4,3,2],[8,7,6,6,4,5])
grouped = s.groupby(level = 0,sort = False )
grouped.count()
grouped.sum()
grouped.last()
grouped.first()
修改索引的名称
df.groupby(['X']).get_group('A')
index = pd.MultiIndex.from_arrays(arrays,name = ['first','second'])
s = pd.Series(np.random.randn(8),index = index )
s.groupby(level = 'first')
进行细化计算
groupd = df.groupby(['A','B'])
groupd.aggregate[np.sum]
df.groupby(['A','B']).sum().reset_index()
grouped.describe().head()
grouped = df.groupby('A')
grouped['C'].agg([np.sum,np.mean,np.std])
|