numpy 主要用于处理数值型数据;pandas 基于 numpy 构建,除了数值之外,还能够处理其他类型的数据(如字符串、时间序列等)。
Series
import pandas as pd
import numpy as np
import string
# Build a 10-element Series: values 0-9, labelled with the letters A-J.
index_labels = list(string.ascii_uppercase[:10])
t = pd.Series(np.arange(10), index=index_labels)
t
A 0
B 1
C 2
D 3
E 4
F 5
G 6
H 7
I 8
J 9
dtype: int64
type(t)
pandas.core.series.Series
# Map each of the first ten uppercase letters to its position (A -> 0 ... J -> 9).
a = dict(zip(string.ascii_uppercase[:10], range(10)))
a
{'A': 0,
'B': 1,
'C': 2,
'D': 3,
'E': 4,
'F': 5,
'G': 6,
'H': 7,
'I': 8,
'J': 9}
pd.Series(a)
A 0
B 1
C 2
D 3
E 4
F 5
G 6
H 7
I 8
J 9
dtype: int64
# Build the Series from dict `a` but with the labels F-O as the index.
# Labels that are not keys of `a` (K-O) get NaN, and the presence of NaN
# makes the resulting dtype float64 instead of int64.
pd.Series(a,index=list(string.ascii_uppercase[5:15]))
F 5.0
G 6.0
H 7.0
I 8.0
J 9.0
K NaN
L NaN
M NaN
N NaN
O NaN
dtype: float64
t
A 0
B 1
C 2
D 3
E 4
F 5
G 6
H 7
I 8
J 9
dtype: int64
t["F"]
5
# Positional access to the second element. `t` is labelled with letters, so a
# bare integer inside [] is not one of its labels; .iloc makes the positional
# intent explicit instead of relying on the deprecated integer fallback of
# Series.__getitem__.
t.iloc[1]
1
# Select the elements at positions 2, 4 and 5. `t` is labelled with letters,
# so integer keys are positions, not labels; .iloc states that explicitly
# instead of relying on the deprecated integer fallback of Series.__getitem__.
t.iloc[[2,4,5]]
C 2
E 4
F 5
dtype: int64
t[["A","G"]]
A 0
G 6
dtype: int64
t.index
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype='object')
t.values
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
type(t.index)
pandas.core.indexes.base.Index
type(t.values)
numpy.ndarray
DataFrame
# 3x4 DataFrame holding 0..11 in row-major order, with default integer
# row and column labels.
grid = np.arange(12).reshape(3, 4)
tdf = pd.DataFrame(grid)
tdf
tdf.shape
(3, 4)
tdf.dtypes
0 int64
1 int64
2 int64
3 int64
dtype: object
tdf.ndim
2
tdf.index
RangeIndex(start=0, stop=3, step=1)
tdf.columns
RangeIndex(start=0, stop=4, step=1)
tdf.values
array([[ 0, 1, 2, 3],
[ 4, 5, 6, 7],
[ 8, 9, 10, 11]])
tdf.head(2)
tdf.tail(3)
tdf.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 0 3 non-null int64
1 1 3 non-null int64
2 2 3 non-null int64
3 3 3 non-null int64
dtypes: int64(4)
memory usage: 224.0 bytes
df = pd.read_csv("./day04/code/dogNames2.csv")

# Keep the names whose count lies strictly between 800 and 1000. Both bounds
# must hold, so the two masks are combined with `&`; with `|` the condition is
# true for every row and nothing is filtered out.
in_range = (df["Count_AnimalName"] > 800) & (df["Count_AnimalName"] < 1000)
print(df[in_range],'\n')

# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()
Row_Labels Count_AnimalName
0 1 1
1 2 2
2 40804 1
3 90201 1
4 90203 1
... ... ...
16215 37916 1
16216 38282 1
16217 38583 1
16218 38948 1
16219 39743 1
[16220 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16220 entries, 0 to 16219
Data columns (total 2 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Row_Labels 16217 non-null object
1 Count_AnimalName 16220 non-null int64
dtypes: int64(1), object(1)
memory usage: 253.6+ KB
None
DataFrame是由Series组成的
df[:20]
| | Row_Labels | Count_AnimalName |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 2 | 2 |
| 2 | 40804 | 1 |
| 3 | 90201 | 1 |
| 4 | 90203 | 1 |
| 5 | 102201 | 1 |
| 6 | 3010271 | 1 |
| 7 | MARCH | 2 |
| 8 | APRIL | 51 |
| 9 | AUGUST | 14 |
| 10 | DECEMBER | 4 |
| 11 | SUNDAY | 13 |
| 12 | MONDAY | 4 |
| 13 | FRIDAY | 19 |
| 14 | JAN | 1 |
| 15 | JUN | 1 |
| 16 | JANUARY | 1 |
| 17 | JUNE | 24 |
| 18 | JULY | 9 |
| 19 | MON | 2 |
df["Count_AnimalName"]
0 1
1 2
2 1
3 1
4 1
..
16215 1
16216 1
16217 1
16218 1
16219 1
Name: Count_AnimalName, Length: 16220, dtype: int64
type(df["Count_AnimalName"])
pandas.core.series.Series
pandas的loc和iloc
df.loc[1,"Row_Labels"]
'2'
df.loc[[1,2,4,5],["Row_Labels","Count_AnimalName"]]
| | Row_Labels | Count_AnimalName |
|---|---|---|
| 1 | 2 | 2 |
| 2 | 40804 | 1 |
| 4 | 90203 | 1 |
| 5 | 102201 | 1 |
df.loc[[4],["Row_Labels","Count_AnimalName"]]
| | Row_Labels | Count_AnimalName |
|---|---|---|
| 4 | 90203 | 1 |
# Label-based selection with .loc: the row slice 1:8 includes BOTH endpoints,
# so this returns the rows labelled 1 through 8 for the two named columns.
df.loc[1:8,["Row_Labels","Count_AnimalName"]]
| | Row_Labels | Count_AnimalName |
|---|---|---|
| 1 | 2 | 2 |
| 2 | 40804 | 1 |
| 3 | 90201 | 1 |
| 4 | 90203 | 1 |
| 5 | 102201 | 1 |
| 6 | 3010271 | 1 |
| 7 | MARCH | 2 |
| 8 | APRIL | 51 |
# Position-based selection with .iloc: the row slice 1:6 excludes the end
# position, so this returns rows at positions 1 through 5 and the columns at
# positions 0 and 1.
df.iloc[1:6,[0,1]]
| | Row_Labels | Count_AnimalName |
|---|---|---|
| 1 | 2 | 2 |
| 2 | 40804 | 1 |
| 3 | 90201 | 1 |
| 4 | 90203 | 1 |
| 5 | 102201 | 1 |
df[df["Count_AnimalName"]>999]
| | Row_Labels | Count_AnimalName |
|---|---|---|
| 1156 | BELLA | 1195 |
| 9140 | MAX | 1153 |
# Boolean indexing with two conditions joined by `&` (element-wise AND):
# keep rows whose count is above 700 and whose name has at least 4 characters.
# Each condition is wrapped in its own parentheses before being combined.
df[(df["Count_AnimalName"]>700)&(df["Row_Labels"].str.len()>=4)]
| | Row_Labels | Count_AnimalName |
|---|---|---|
| 1156 | BELLA | 1195 |
| 2660 | CHARLIE | 856 |
| 3251 | COCO | 852 |
| 8417 | LOLA | 795 |
| 8552 | LUCKY | 723 |
| 8560 | LUCY | 710 |
| 12368 | ROCKY | 823 |
file_path51 = "./day05/code/IMDB-Movie-Data.csv"
df51 = pd.read_csv(file_path51)

# Show the first row and the average of the "Rating" column.
print(df51.head(1))
print('rating-meaning:',df51["Rating"].mean())

# Each "Actors" cell is a ", "-separated string; split every cell into a list
# of names, flatten the per-row lists, and count the distinct names.
temp_actors_list = df51["Actors"].str.split(", ").tolist()
actors_list = []
for row_actors in temp_actors_list:
    actors_list.extend(row_actors)
actors_num = len(set(actors_list))
print(actors_num)
Rank Title Genre \
0 1 Guardians of the Galaxy Action,Adventure,Sci-Fi
Description Director \
0 A group of intergalactic criminals are forced ... James Gunn
Actors Year Runtime (Minutes) \
0 Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S... 2014 121
Rating Votes Revenue (Millions) Metascore
0 8.1 757074 333.13 76.0
rating-meaning: 6.723199999999999
2015
import pandas as pd
from matplotlib import pyplot as plt
file_path52 = "./day05/code/starbucks_store_worldwide.csv"
df52 = pd.read_csv(file_path52)

# Count the non-null "Brand" entries per country and keep the ten largest.
brand_counts = df52.groupby(by="Country")["Brand"].count()
data1 = brand_counts.sort_values(ascending=False).head(10)
print(data1,'\n')

# Bar chart: one bar per country, labelled with the country code.
_x = data1.index
_y = data1.values
bar_positions = range(len(_x))

plt.figure(figsize=(20,8),dpi=80)
plt.bar(bar_positions,_y)
plt.xticks(bar_positions,_x)
plt.show()
Country
US 13608
CN 2734
CA 1468
JP 1237
KR 993
GB 901
MX 579
TW 394
TR 326
PH 298
Name: Brand, dtype: int64
import pandas as pd
from matplotlib import pyplot as plt
file_path61 = "./day06/code/PM2.5/BeijingPM20100101_20151231.csv"
df61 = pd.read_csv(file_path61)

# Combine the separate year/month/day/hour columns into one hourly period per row.
period = pd.PeriodIndex(year=df61["year"],month=df61["month"],day=df61["day"],hour=df61["hour"],freq="H")
df61["datetime"] = period
print('head():\n',df61.head(),'\n')

# Index by time, then average the hourly readings over 7-day windows.
# numeric_only=True restricts the mean to numeric columns; the frame also
# holds the text column "cbwd", which cannot be averaged.
df61.set_index("datetime",inplace=True)
df61 = df61.resample("7D").mean(numeric_only=True)
print('resample:\n',df61.head())

# The two PM2.5 series to compare.
data =df61["PM_US Post"]
data_china = df61["PM_Nongzhanguan"]
print(data_china.head())

# Format the period labels as YYYYMMDD strings for the x-axis ticks.
_x = data.index
_x = [i.strftime("%Y%m%d") for i in _x]
_x_china = [i.strftime("%Y%m%d") for i in data_china.index]
# Report the length of BOTH label lists (the US list, then the CN list).
print(len(_x),len(_x_china))

_y = data.values
_y_china = data_china.values

plt.figure(figsize=(20,8),dpi=80)
plt.plot(range(len(_x)),_y,label="US_POST",alpha=0.7)
plt.plot(range(len(_x_china)),_y_china,label="CN_POST",alpha=0.7)
# Show every 10th label, rotated, so the tick text does not overlap.
plt.xticks(range(0,len(_x_china),10),list(_x_china)[::10],rotation=45)
plt.legend(loc="best")
plt.show()
head():
No year month day hour season PM_Dongsi PM_Dongsihuan \
0 1 2010 1 1 0 4 NaN NaN
1 2 2010 1 1 1 4 NaN NaN
2 3 2010 1 1 2 4 NaN NaN
3 4 2010 1 1 3 4 NaN NaN
4 5 2010 1 1 4 4 NaN NaN
PM_Nongzhanguan PM_US Post DEWP HUMI PRES TEMP cbwd Iws \
0 NaN NaN -21.0 43.0 1021.0 -11.0 NW 1.79
1 NaN NaN -21.0 47.0 1020.0 -12.0 NW 4.92
2 NaN NaN -21.0 43.0 1019.0 -11.0 NW 6.71
3 NaN NaN -21.0 55.0 1019.0 -14.0 NW 9.84
4 NaN NaN -20.0 51.0 1018.0 -12.0 NW 12.97
precipitation Iprec datetime
0 0.0 0.0 2010-01-01 00:00
1 0.0 0.0 2010-01-01 01:00
2 0.0 0.0 2010-01-01 02:00
3 0.0 0.0 2010-01-01 03:00
4 0.0 0.0 2010-01-01 04:00
resample:
No year month day hour season PM_Dongsi \
datetime
2010-01-01 84.5 2010.0 1.000000 4.000000 11.5 4.0 NaN
2010-01-08 252.5 2010.0 1.000000 11.000000 11.5 4.0 NaN
2010-01-15 420.5 2010.0 1.000000 18.000000 11.5 4.0 NaN
2010-01-22 588.5 2010.0 1.000000 25.000000 11.5 4.0 NaN
2010-01-29 756.5 2010.0 1.571429 14.285714 11.5 4.0 NaN
PM_Dongsihuan PM_Nongzhanguan PM_US Post DEWP HUMI \
datetime
2010-01-01 NaN NaN 71.627586 -18.255952 54.395833
2010-01-08 NaN NaN 69.910714 -19.035714 49.386905
2010-01-15 NaN NaN 163.654762 -12.630952 57.755952
2010-01-22 NaN NaN 68.069307 -17.404762 34.095238
2010-01-29 NaN NaN 53.583333 -17.565476 34.928571
PRES TEMP Iws precipitation Iprec
datetime
2010-01-01 1027.910714 -10.202381 43.859821 0.066667 0.786905
2010-01-08 1030.035714 -10.029762 45.392083 0.000000 0.000000
2010-01-15 1030.386905 -4.946429 17.492976 0.000000 0.000000
2010-01-22 1026.196429 -2.672619 54.854048 0.000000 0.000000
2010-01-29 1025.273810 -2.083333 26.625119 0.000000 0.000000
datetime
2010-01-01 NaN
2010-01-08 NaN
2010-01-15 NaN
2010-01-22 NaN
2010-01-29 NaN
Freq: 7D, Name: PM_Nongzhanguan, dtype: float64
313 313
|