Pandas 库是一个免费、开源的第三方 Python 库,是 Python 数据分析必不可少的工具之一,它为 Python 数据分析提供了高性能且易于使用的数据结构,即 Series 和 DataFrame。Pandas 库基于 Python NumPy 库开发而来,因此,它可以与 Python 的科学计算库配合使用。Pandas 提供了两种数据结构,分别是 Series(一维数组结构)与 DataFrame(二维数组结构),这两种数据结构极大地增强了 Pandas 的数据分析能力。
中文教程:http://c.biancheng.net/pandas/
官方文档:https://pandas.pydata.org/docs/
样例
代码
import pandas as pd
import numpy as np
import random as rd
import matplotlib.pyplot as plt
# A five-element integer Series with an explicit integer index 0..4.
sr = pd.Series(data=[20, 30, 50, 70, 25], index=[0, 1, 2, 3, 4])

# Print the Series in each export format. The table only stores the bound
# methods; each one is called right before its own print, so the conversions
# run one at a time in the listed order.
for label, convert in (
    ("list", sr.to_list),
    ("dict", sr.to_dict),
    ("numpy", sr.to_numpy),
    ("json", sr.to_json),
    ("latex", sr.to_latex),
    ("markdown", sr.to_markdown),
    ("unique", sr.unique),
):
    print(f"\n{label} = \n", convert())
# Sample passenger table, written row by row (one tuple per person) and
# labelled through `columns`.
df = pd.DataFrame(
    data=[
        ("Braund, Mr. Owen Harris", 22, "male"),
        ("Allen, Mr. William Henry", 35, "male"),
        ("Bonnell, Miss. Elizabeth", 58, "female"),
        ("Allen, Mr. Elizabeth", 12, "male"),
        ("Braund, Miss. Elizabeth", 11, "female"),
    ],
    columns=["Name", "Age", "Sex"],
)
# Print the DataFrame in each export format. Every converter is invoked
# immediately before its own print, so they run in the listed order.
for label, convert in (
    ("dict", df.to_dict),
    ("numpy", df.to_numpy),
    ("json", df.to_json),
    ("latex", df.to_latex),
    ("markdown", df.to_markdown),
):
    print(f"\n{label} = \n", convert())
# Structural overview: info() writes its own report, then the
# (rows, columns) shape tuple is printed.
print("\n属性:")
df.info()
print("\nshape = ", df.shape)

# Show both inputs before the Series is written into the frame.
print("\ndf = \n", df)
print("\nsr = \n", sr)

# Overwrite the "Age" column with the Series, once through item access and
# once through attribute access; both assignments target the same column.
df["Age"] = sr
df.Age = sr

# Derive a new column from column arithmetic (twice Age minus Age).
df["Age2"] = df["Age"] * 2 - df["Age"]
print("\ndf = \n", df)

# rename() hands back a new frame, so the result is bound to df again.
df = df.rename(columns={"Age2": "age"})
print("\ndf = \n", df)

# Shift every "Age" value up by one half.
df["Age"] += 0.5
print("\ndf = \n", df)
# Round-trip through an Excel workbook: write the frame to sheet "123"
# without the index column, then load that same sheet back into df2.
df.to_excel(excel_writer='./data.xlsx', sheet_name="123", index=False)
df2 = pd.read_excel(io='./data.xlsx', sheet_name="123")
print("\ndf2 = \n", df2)

# head(2) limits the view to the first two rows.
print("\ndf2.head(2) = \n", df2.head(n=2))
# Select two columns by label; the result is itself a DataFrame.
age_sex = df.loc[:, ["Age", "Sex"]]
print("\nage - sex = \n", age_sex)

# Positional write: rows 1..3 of the column at position 1 are set to 123.
df.iloc[1:4, 1] = 123
print("\ndf = \n", df)

# Keep only the rows whose "Age" is greater than 25.
age25 = df[df["Age"].gt(25)]
print("\nage > 25 = \n", age25)

# Frequency of each distinct "Age" value.
print("\nvalue_counts = \n", df.Age.value_counts(), "\n")

print("max = ", df.Age.max())
# NOTE: the label reads "maxargmin", but the value printed is argmin(),
# i.e. the position of the smallest "Age".
print("maxargmin = ", df.Age.argmin())

# describe() is evaluated once and reused for both the type and the table.
summary = df.describe()
print("\ndescribe = \n", type(summary), "\n", summary)

print("mean = ", df.Age.mean())
# Mean "Age" per "Sex" group.
print("\nmean_by_Sex = \n", df.groupby(by="Sex")["Age"].mean())

# On a Series, add_prefix / add_suffix relabel the index entries.
print("\nprefix = \n", df["Age"].add_prefix(prefix=123))
print("\nsuffix = \n", df["Age"].add_suffix(suffix="a"))
# Order rows by "Name", breaking ties with "Age".
df3 = df.sort_values(["Name", "Age"])
print("\nsort(df) = \n", df3)

# Order rows by the distance of "Age" from 60; `key` receives the whole
# column and returns the values that are actually compared.
df3 = df.sort_values("Age", key=lambda ages: (ages - 60).abs())
print("\nsort(df) = \n", df3)

# Move two columns into a two-level row index.
df3 = df.set_index(keys=["Name", "age"])
print("\ndf.set_index = \n", df3)

# Long -> wide: one row per "Name", one column per ("Age"/"age", "Sex") pair.
df3 = df.pivot(columns="Sex", index="Name", values=["Age", "age"])
print("\ndf.pivot = \n", df3)

# Mean of "age" for each Age/Sex combination; margins=True adds "All" totals.
df3 = df.pivot_table(
    index="Age",
    columns="Sex",
    values="age",
    aggfunc="mean",
    margins=True,
)
print("\ndf.pivot_table = \n", df3)

# Turn the pivot table's row index back into an ordinary column.
df3 = df3.reset_index()
print("\ndf.pivot_table.reset_index = \n", df3)

# Wide -> long: keep "Name" and stack every other column as variable/value.
df3 = df.melt(id_vars=["Name"])
print("\ndf.melt = \n", df3)

# Stack the frame on top of itself along the row axis.
df3 = pd.concat(objs=[df, df], axis="index")
print("\nconcat = \n", df3)

# Same stacking, with an outer index level naming each input.
df3 = pd.concat(objs=[df, df], keys=["PM25", "NO2"])
print("\nconcat = \n", df3)
# Second table sharing the "Name" column with df, plus extra per-person
# attributes; "Time" is stored as plain timestamp strings.
df2 = pd.DataFrame(
    dict(
        Name=[
            "Braund, Mr. Owen Harris",
            "Allen, Mr. William Henry",
            "Bonnell, Miss. Elizabeth",
            "Allen, Mr. Elizabeth",
            "Braund, Miss. Elizabeth",
        ],
        Work=[0, 0, 1, 1, 0],
        Address=[3, 1, 5, 3, 5],
        Time=[
            '2019-06-21 00:00:00+00:00',
            '2019-06-20 23:00:00+00:00',
            '2019-06-19 22:00:00+00:00',
            '2019-06-22 01:00:00+00:00',
            '2019-06-20 09:00:00+00:00',
        ],
    )
)
# Left join on "Name": every row of df is kept and gains df2's columns.
df3 = df.merge(df2, how="left", on="Name")
print("\nmerge = \n", df3)

# Parse the timestamp strings of df2 into datetime values.
print(pd.to_datetime(df2["Time"]))

# Swap cell values according to the mapping; the result goes to df3 and df
# is not reassigned.
df3 = df.replace(to_replace={"male": "M", "female": "F"})
print("\ndf = \n", df3)
# Blue dashed line plot of "Age" against "Sex".
df.plot(kind="line", x="Sex", y="Age", c="b", linestyle="--")
# Scatter plot of the same two columns, drawn with red star markers.
df.plot(kind="scatter", x="Sex", y="Age", c="r", marker="*")
# Display the figures created above.
plt.show()
测试结果
list =
[20, 30, 50, 70, 25]
dict =
{0: 20, 1: 30, 2: 50, 3: 70, 4: 25}
numpy =
[20 30 50 70 25]
json =
{"0":20,"1":30,"2":50,"3":70,"4":25}
latex =
\begin{tabular}{lr}
\toprule
{} & 0 \\
\midrule
0 & 20 \\
1 & 30 \\
2 & 50 \\
3 & 70 \\
4 & 25 \\
\bottomrule
\end{tabular}
markdown =
| | 0 |
|---:|----:|
| 0 | 20 |
| 1 | 30 |
| 2 | 50 |
| 3 | 70 |
| 4 | 25 |
unique =
[20 30 50 70 25]
dict =
{'Name': {0: 'Braund, Mr. Owen Harris', 1: 'Allen, Mr. William Henry', 2: 'Bonnell, Miss. Elizabeth', 3: 'Allen, Mr. Elizabeth', 4: 'Braund, Miss. Elizabeth'}, 'Age': {0: 22, 1: 35, 2: 58, 3: 12, 4: 11}, 'Sex': {0: 'male', 1: 'male', 2: 'female', 3: 'male', 4: 'female'}}
numpy =
[['Braund, Mr. Owen Harris' 22 'male']
['Allen, Mr. William Henry' 35 'male']
['Bonnell, Miss. Elizabeth' 58 'female']
['Allen, Mr. Elizabeth' 12 'male']
['Braund, Miss. Elizabeth' 11 'female']]
json =
{"Name":{"0":"Braund, Mr. Owen Harris","1":"Allen, Mr. William Henry","2":"Bonnell, Miss. Elizabeth","3":"Allen, Mr. Elizabeth","4":"Braund, Miss. Elizabeth"},"Age":{"0":22,"1":35,"2":58,"3":12,"4":11},"Sex":{"0":"male","1":"male","2":"female","3":"male","4":"female"}}
latex =
\begin{tabular}{llrl}
\toprule
{} & Name & Age & Sex \\
\midrule
0 & Braund, Mr. Owen Harris & 22 & male \\
1 & Allen, Mr. William Henry & 35 & male \\
2 & Bonnell, Miss. Elizabeth & 58 & female \\
3 & Allen, Mr. Elizabeth & 12 & male \\
4 & Braund, Miss. Elizabeth & 11 & female \\
\bottomrule
\end{tabular}
markdown =
| | Name | Age | Sex |
|---:|:-------------------------|------:|:-------|
| 0 | Braund, Mr. Owen Harris | 22 | male |
| 1 | Allen, Mr. William Henry | 35 | male |
| 2 | Bonnell, Miss. Elizabeth | 58 | female |
| 3 | Allen, Mr. Elizabeth | 12 | male |
| 4 | Braund, Miss. Elizabeth | 11 | female |
属性:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 5 non-null object
1 Age 5 non-null int64
2 Sex 5 non-null object
dtypes: int64(1), object(2)
memory usage: 248.0+ bytes
shape = (5, 3)
df =
Name Age Sex
0 Braund, Mr. Owen Harris 22 male
1 Allen, Mr. William Henry 35 male
2 Bonnell, Miss. Elizabeth 58 female
3 Allen, Mr. Elizabeth 12 male
4 Braund, Miss. Elizabeth 11 female
sr =
0 20
1 30
2 50
3 70
4 25
dtype: int64
df =
Name Age Sex Age2
0 Braund, Mr. Owen Harris 20 male 20
1 Allen, Mr. William Henry 30 male 30
2 Bonnell, Miss. Elizabeth 50 female 50
3 Allen, Mr. Elizabeth 70 male 70
4 Braund, Miss. Elizabeth 25 female 25
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20 male 20
1 Allen, Mr. William Henry 30 male 30
2 Bonnell, Miss. Elizabeth 50 female 50
3 Allen, Mr. Elizabeth 70 male 70
4 Braund, Miss. Elizabeth 25 female 25
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 30.5 male 30
2 Bonnell, Miss. Elizabeth 50.5 female 50
3 Allen, Mr. Elizabeth 70.5 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
df2 =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 30.5 male 30
2 Bonnell, Miss. Elizabeth 50.5 female 50
3 Allen, Mr. Elizabeth 70.5 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
df2.head(2) =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 30.5 male 30
age - sex =
Age Sex
0 20.5 male
1 30.5 male
2 50.5 female
3 70.5 male
4 25.5 female
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
age > 25 =
Name Age Sex age
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
value_counts =
123.0 3
20.5 1
25.5 1
Name: Age, dtype: int64
max = 123.0
maxargmin = 0
describe =
<class 'pandas.core.frame.DataFrame'>
Age age
count 5.000000 5.000000
mean 83.000000 39.000000
std 54.800776 20.736441
min 20.500000 20.000000
25% 25.500000 25.000000
50% 123.000000 30.000000
75% 123.000000 50.000000
max 123.000000 70.000000
mean = 83.0
mean_by_Sex =
Sex
female 74.250000
male 88.833333
Name: Age, dtype: float64
prefix =
1230 20.5
1231 123.0
1232 123.0
1233 123.0
1234 25.5
Name: Age, dtype: float64
suffix =
0a 20.5
1a 123.0
2a 123.0
3a 123.0
4a 25.5
Name: Age, dtype: float64
sort(df) =
Name Age Sex age
3 Allen, Mr. Elizabeth 123.0 male 70
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
4 Braund, Miss. Elizabeth 25.5 female 25
0 Braund, Mr. Owen Harris 20.5 male 20
sort(df) =
Name Age Sex age
4 Braund, Miss. Elizabeth 25.5 female 25
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
df.set_index =
Age Sex
Name age
Braund, Mr. Owen Harris 20 20.5 male
Allen, Mr. William Henry 30 123.0 male
Bonnell, Miss. Elizabeth 50 123.0 female
Allen, Mr. Elizabeth 70 123.0 male
Braund, Miss. Elizabeth 25 25.5 female
df.pivot =
Age age
Sex female male female male
Name
Allen, Mr. Elizabeth NaN 123.0 NaN 70.0
Allen, Mr. William Henry NaN 123.0 NaN 30.0
Bonnell, Miss. Elizabeth 123.0 NaN 50.0 NaN
Braund, Miss. Elizabeth 25.5 NaN 25.0 NaN
Braund, Mr. Owen Harris NaN 20.5 NaN 20.0
df.pivot_table =
Sex female male All
Age
20.5 NaN 20.0 20.0
25.5 25.0 NaN 25.0
123.0 50.0 50.0 50.0
All 37.5 40.0 39.0
df.pivot_table.reset_index =
Sex Age female male All
0 20.5 NaN 20.0 20.0
1 25.5 25.0 NaN 25.0
2 123.0 50.0 50.0 50.0
3 All 37.5 40.0 39.0
df.melt =
Name variable value
0 Braund, Mr. Owen Harris Age 20.5
1 Allen, Mr. William Henry Age 123.0
2 Bonnell, Miss. Elizabeth Age 123.0
3 Allen, Mr. Elizabeth Age 123.0
4 Braund, Miss. Elizabeth Age 25.5
5 Braund, Mr. Owen Harris Sex male
6 Allen, Mr. William Henry Sex male
7 Bonnell, Miss. Elizabeth Sex female
8 Allen, Mr. Elizabeth Sex male
9 Braund, Miss. Elizabeth Sex female
10 Braund, Mr. Owen Harris age 20
11 Allen, Mr. William Henry age 30
12 Bonnell, Miss. Elizabeth age 50
13 Allen, Mr. Elizabeth age 70
14 Braund, Miss. Elizabeth age 25
concat =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
concat =
Name Age Sex age
PM25 0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
NO2 0 Braund, Mr. Owen Harris 20.5 male 20
1 Allen, Mr. William Henry 123.0 male 30
2 Bonnell, Miss. Elizabeth 123.0 female 50
3 Allen, Mr. Elizabeth 123.0 male 70
4 Braund, Miss. Elizabeth 25.5 female 25
merge =
Name Age Sex age Work Address Time
0 Braund, Mr. Owen Harris 20.5 male 20 0 3 2019-06-21 00:00:00+00:00
1 Allen, Mr. William Henry 123.0 male 30 0 1 2019-06-20 23:00:00+00:00
2 Bonnell, Miss. Elizabeth 123.0 female 50 1 5 2019-06-19 22:00:00+00:00
3 Allen, Mr. Elizabeth 123.0 male 70 1 3 2019-06-22 01:00:00+00:00
4 Braund, Miss. Elizabeth 25.5 female 25 0 5 2019-06-20 09:00:00+00:00
0 2019-06-21 00:00:00+00:00
1 2019-06-20 23:00:00+00:00
2 2019-06-19 22:00:00+00:00
3 2019-06-22 01:00:00+00:00
4 2019-06-20 09:00:00+00:00
Name: Time, dtype: datetime64[ns, UTC]
df =
Name Age Sex age
0 Braund, Mr. Owen Harris 20.5 M 20
1 Allen, Mr. William Henry 123.0 M 30
2 Bonnell, Miss. Elizabeth 123.0 F 50
3 Allen, Mr. Elizabeth 123.0 M 70
4 Braund, Miss. Elizabeth 25.5 F 25
|