import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
# Bar chart of how many movies fall into each genre of the IMDB data set.
file_path = "IMDB-Movie-Data.csv"
df = pd.read_csv(file_path)

# One-hot encode the comma-separated "Genre" column: one row per movie, one
# column per genre, 1 where the movie carries that genre. A single vectorised
# call replaces the former row-by-row ``.loc`` assignment loop. A missing
# Genre value now yields an all-zero row instead of raising, and the columns
# come out sorted by name rather than in set-iteration order.
zeros_df = df["Genre"].str.get_dummies(sep=",")
genre_list = list(zeros_df.columns)
# print(zeros_df.head(3))

# Number of movies per genre: sum every column (axis=0 collapses the rows).
genre_count = zeros_df.sum(axis=0)
# print(genre_count)

# Sort ascending so the bars grow from left to right.
genre_count = genre_count.sort_values()

# Draw the chart.
plt.figure(figsize=(20, 8), dpi=80)
# Labels and heights are both taken from the sorted Series; genre_list is not
# reordered by sort_values(), so it must not be used for the tick labels.
_x = genre_count.index
_y = genre_count.values
plt.bar(range(len(_x)), _y)
plt.xticks(range(len(_x)), _x)
plt.show()
import pandas as pd
import numpy as np
# Two small frames used for the join / merge / index experiments below.
df1 = pd.DataFrame(
    np.ones((2, 4)),
    index=list("AB"),
    columns=["a", "b", "c", "d"],
)
print(df1)
df2 = pd.DataFrame(
    np.zeros((3, 3)),
    index=list("ABC"),
    columns=["x", "y", "z"],
)
#print(df2)
# .join combines the frames on their row index
#tem_test1 = df1.join(df2)
#print(tem_test1)
#tem_test2 = df2.join(df1)
#print(tem_test2)
# .merge combines the frames on column values
# df3 = pd.DataFrame(np.arange(9).reshape((3,3)),columns=list("fax"))
# print(df3)
# #df1.loc["A","a"]=100
# #print(df1) # if df1 is changed so that column a holds only a single 1, the merge yields just 1 row
# print(df1.merge(df3,on="a")) # column a of df1 is 1 in both rows A and B, so merging on a yields 2 rows
# print(df1.index)
# df1.index = ["a","b"]
# print(df1)
# print("*"*100)
# print(df1.reindex(["a","f"])) # .reindex pulls rows ["a","f"] out of the data; row a exists, so it has values
# print(df1) # the data has no row f, so there is nothing to pull and that row shows NaN
# print("*"*100)
# print(df1.set_index("a"))
# print(df1.set_index("a").index)
# print("*"*100)
# print(df1.set_index("a",drop=False))
# print(df1["d"].unique())
# print(df1.set_index("a").index.unique())
# print(len(df1.set_index("a").index))
# print(df1.set_index(["a","b"]))
# print(df1.set_index(["a","b"]).index)
# Selecting data through a two-level (hierarchical) index.
separator = "*" * 100
a = pd.DataFrame(
    {
        "a": range(7),
        "b": range(7, 0, -1),
        "c": ["one"] * 3 + ["two"] * 4,
        "d": list("hjklmno"),
    }
)
print(a)

# Move columns c and d into the index: c is the outer level, d the inner one.
b = a.set_index(["c", "d"])
print(b)

# Column "a" of b is a Series that carries the same two-level index.
c = b["a"]
print(c)
# Outer label first, then inner label, addresses a single value.
print(c["one"]["j"])
print(separator)
# The outer label alone returns every entry stored under "one".
print(c["one"])

# Same values, but indexed with d as the outer level and c as the inner one.
indexed_by_d_c = a.set_index(["d", "c"])
d = indexed_by_d_c["a"]
print(d)
# swaplevel() exchanges the two index levels, making c the outer level again.
print(d.swaplevel())
print(separator)

# Two routes to the data labelled "h": drill down outer-then-inner, or swap
# the levels first so "h" can be looked up directly.
one_rows = b.loc["one"]
print(one_rows.loc["h"])
swapped = b.swaplevel()
print(swapped.loc["h"])
import pandas as pd
import numpy as np
# Group the store directory by country and state/province and count brands.
file_path = "./directory.csv"
df = pd.read_csv(file_path)
#print(df.head(1))
#print(df.info())
#grouped = df.groupby(by="Country")
#print(grouped)
# A DataFrameGroupBy object can be iterated as (key, sub-frame) pairs
#for i,j in grouped:
# print(i)
# print("-" * 100)
# print(j)
# print("*"*100)
#df[df["Country"] == "US"]
# It can also be aggregated
#country_count = grouped["Brand"].count()
#print(country_count["US"])
#print(country_count["CN"])
# Count the number of stores in each province of China
#china_data = df[df["Country"] == "CN"]
#grouped = china_data.groupby(by="State/Province").count()["Brand"]
#print(grouped)
# Grouping by several keys: df["Brand"] is a Series, not a DataFrame, so the
# grouping keys have to be passed explicitly as Series, e.g. df["Country"]
#grouped = df["Brand"].groupby(by=[df["Country"],df["State/Province"]]).count() # the result is a Series with a two-level index
#print(grouped)
# To get a DataFrame back, select the column with a list. grouped1, grouped2
# and grouped3 all give the same result.
group_keys = [df["Country"], df["State/Province"]]
brand_frame = df[["Brand"]]
grouped1 = brand_frame.groupby(by=group_keys).count()
# grouped2 = df.groupby(by=[df["Country"],df["State/Province"]])[["Brand"]].count()
# # grouped3 = df.groupby(by=[df["Country"],df["State/Province"]]).count()[["Brand"]]
# Methods and attributes of the resulting index
print(grouped1.index)
import pandas as pd
from matplotlib import pyplot as plt
# Per-country non-null counts for every column of the store directory.
file_path = "./directory.csv"
df = pd.read_csv(file_path)
country_groups = df.groupby(by="Country")
data1 = country_groups.count()  # ["Brand"].sort_values(ascending=False)[:10]
print(data1)
# Disabled: bar chart of the values in data1 (meant for the top-10 selection
# that is commented out above).
# _x = data1.index
# _y = data1.values
# print(len(_x))
#
# print(range(len(_x)))
# plt.figure(figsize=(20,8),dpi=80)
#
# plt.bar(range(len(_x)),_y)
#
# plt.xticks(range(len(_x)),_x)
#
#
# plt.show()
|