1、字符串离散化
将以逗号分隔的字符串形式的分类信息(例如电影的 Genre 列)拆分开,转化为每个类别一列的 0/1 指示矩阵。
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
# Load the movie table; the path is relative to the current working directory.
file_path = './IMDB-Movie-Data.csv'
data = pd.read_csv(file_path)
# Handy while exploring:
# print(data.head())
# print(data.info())
# print(data['Genre'].tolist())

# Every 'Genre' cell is split on ','; collect the distinct labels that occur.
genreListTemp = data['Genre'].tolist()
genreSet = set()
for gen in genreListTemp:
    genreSet.update(gen.split(','))
# Sorted so the column order is the same on every run (set iteration order
# for strings is not stable between interpreter runs).
genreList = sorted(genreSet)

# Start from an all-zero integer table: one row per row of `data`,
# one column per distinct genre label.
genres = pd.DataFrame(np.zeros((data.shape[0], len(genreList))), dtype=int, columns=genreList)
print(genres)

# `data` comes straight from read_csv without index_col, and `genres` is built
# without an index, so position i labels the same row in both tables.
for i, gens in enumerate(genreListTemp):
    # Flag every genre listed for row i with a 1.
    genres.loc[i, gens.split(',')] = 1
print(genres)
print(type(genres))

# Column sums give the number of rows carrying each genre; sort ascending.
# Kept in its own name so the movie table in `data` is not overwritten.
genre_counts = genres.sum(axis=0).sort_values()
print(genre_counts)

# Bar chart of the per-genre counts.
_x = genre_counts.index
_y = genre_counts.values
plt.figure(figsize=(20, 8), dpi=80)
plt.bar(_x, _y)
# plt.show()
2、数据合并
import pandas as pd
import numpy as np
# Demo frames for join():
#   t1 - 2x3 zeros, rows a/b, columns a/b/c
#   t2 - 3x3 ones,  rows a/b/c, default integer column labels 0/1/2
t1 = pd.DataFrame(
    np.zeros((2, 3)),
    index=list("ab"),
    columns=list("abc"),
    dtype=int,
)
t2 = pd.DataFrame(
    np.ones((3, 3)),
    index=list("abc"),
    dtype=int,
)
for frame in (t1, t2):
    print(frame)

# join() combines two frames side by side on their row index; the column
# names of the two frames must not overlap.
t1_then_t2 = t1.join(t2)
print(t1_then_t2)
#    a  b  c  0  1  2
# a  0  0  0  1  1  1
# b  0  0  0  1  1  1

# The frame join() is called on decides which rows are kept; cells the other
# frame cannot supply are filled with NaN.
t2_then_t1 = t2.join(t1)
print(t2_then_t1)
#    0  1  2    a    b    c
# a  1  1  1  0.0  0.0  0.0
# b  1  1  1  0.0  0.0  0.0
# c  1  1  1  NaN  NaN  NaN

# Third frame for merge(): it shares the column name 'a' with t1.
t3 = pd.DataFrame(
    np.arange(9).reshape(3, 3),
    index=list("abc"),
    columns=list("ayz"),
)
print('t3')
print(t3)
#    a  y  z
# a  0  1  2
# b  3  4  5
# c  6  7  8

# Make one key in t1 differ from every key in t3.
t1.at['a', 'a'] = 100
# t1 is now:
#      a  b  c
# a  100  0  0
# b    0  0  0

# merge() behaves like a database join on the values of the 'on' column:
# rows are paired only where the key values are equal.  'inner' is the
# default; 'outer' keeps keys from both sides; 'left' / 'right' keep the rows
# of the left / right frame respectively.
#
# Output recorded in the original notes for each mode
# (NOTE(review): row order of the 'outer' result may depend on the pandas
# version - confirm against the installed release):
#
# how='inner'
#    a  b  c  y  z
# 0  0  0  0  1  2
#
# how='outer'
#      a    b    c    y    z
# 0  100  0.0  0.0  NaN  NaN
# 1    0  0.0  0.0  1.0  2.0
# 2    3  NaN  NaN  4.0  5.0
# 3    6  NaN  NaN  7.0  8.0
#
# how='left'
#      a  b  c    y    z
# 0  100  0  0  NaN  NaN
# 1    0  0  0  1.0  2.0
#
# how='right'
#    a    b    c  y  z
# 0  0  0.0  0.0  1  2
# 1  3  NaN  NaN  4  5
# 2  6  NaN  NaN  7  8
for how in ('inner', 'outer', 'left', 'right'):
    print(t1.merge(t3, on='a', how=how))
3、索引(分组聚合与 set_index)
import pandas as pd
import numpy as np
# Load the store table; the path is relative to the current working directory.
file_path = './starbucks_store_worldwide.csv'
data = pd.read_csv(file_path)
# Handy while exploring:
# print(data.head(1))
# print(data.info)

# Group the rows by the 'Country' column.  Iterating over the groupby object
# (for i in countryData: print(i)) shows each group in turn.
countryData = data.groupby(by='Country')

# Count of non-null 'Brand' values per country, e.g. country_count['US'] or
# country_count['CN'].
country_count = countryData["Brand"].count()

# Same kind of count, restricted to rows whose 'Country' is 'CN' and grouped
# by 'State/Province'.
is_cn = data['Country'] == 'CN'
chinaData = data.loc[is_cn]
pros = chinaData.groupby(by='State/Province').count()['Brand']

# Grouping by several keys at once produces a compound (multi-level) index.
# Three spellings of the 'Brand' count per (Country, State/Province) follow;
# the double brackets keep each result a one-column DataFrame, not a Series.
group_keys = [data["Country"], data["State/Province"]]
grouped1 = data[['Brand']].groupby(by=group_keys).count()   # select, then group
grouped2 = data.groupby(by=group_keys)[["Brand"]].count()   # group, select, count
grouped3 = data.groupby(by=group_keys).count()[["Brand"]]   # group, count, select
print(grouped1)
# set_index() demo on t1, the 2x3 frame from the merge section, whose
# ['a', 'a'] cell was set to 100.
print(t1)
#      a  b  c
# a  100  0  0
# b    0  0  0

# Each entry is (keys, drop): the column(s) promoted to the index, and
# whether those columns are removed from the data afterwards.
#
# ('a', True) - column 'a' becomes the index and is dropped:
#      b  c
# a
# 100  0  0
# 0    0  0
#
# ('a', False) - column 'a' becomes the index and is also kept as a column:
#        a  b  c
# a
# 100  100  0  0
# 0      0  0  0
#
# (['a', 'b'], False) - two columns form a compound index, both kept:
#          a  b  c
# a   b
# 100 0  100  0  0
# 0   0    0  0  0
for keys, drop in (('a', True), ('a', False), (['a', 'b'], False)):
    print(t1.set_index(keys, drop=drop))
4、复合索引
# Sample frame: two numeric columns plus two label columns that will serve
# as index levels.
a = pd.DataFrame(
    {
        'a': range(7),
        'b': range(7, 0, -1),
        'c': ['one'] * 3 + ['two'] * 4,
        'd': list("hjklmno"),
    }
)
print(a)
#    a  b    c  d
# 0  0  7  one  h
# 1  1  6  one  j
# 2  2  5  one  k
# 3  3  4  two  l
# 4  4  3  two  m
# 5  5  2  two  n
# 6  6  1  two  o

# Promote 'c' and 'd' to a two-level index (outer level 'c', inner level 'd').
b = a.set_index(['c', 'd'])
print(b)
#        a  b
# c   d
# one h  0  7
#     j  1  6
#     k  2  5
# two l  3  4
#     m  4  3
#     n  5  2
#     o  6  1

# Selecting one column yields a Series carrying the same two-level index.
c = b['a']
print(c)
# c    d
# one  h    0
#      j    1
#      k    2
# two  l    3
#      m    4
#      n    5
#      o    6

# A Series can be indexed directly, one level at a time.
one_part = c['one']
print(one_part)
# d
# h    0
# j    1
# k    2
print(one_part['h'])
# 0

# Same data with the index levels in the opposite order: 'd' outer, 'c' inner.
d = a.set_index(['d', 'c'])['a']
print(d)
# d  c
# h  one    0
# j  one    1
# k  one    2
# l  two    3
# m  two    4
# n  two    5
# o  two    6

# swaplevel() exchanges the two levels, putting 'c' back in front of 'd'.
print(d.swaplevel())
# c    d
# one  h    0
#      j    1
#      k    2
# two  l    3
#      m    4
#      n    5
#      o    6

# Row lookup on a DataFrame has to go through .loc; plain b['one'] would be
# read as a column label instead of an index label.
one_rows = b.loc['one']
print(one_rows.loc['h'])
# a    0
# b    7

# After swapping the levels the same row is reached in the opposite order.
h_rows = b.swaplevel().loc['h']
print(h_rows.loc['one'])
# a    0
# b    7
?
|