开发: C++知识库 Java知识库 JavaScript Python PHP知识库人工智能区块链大数据移动开发嵌入式开发工具数据结构与算法开发测试游戏开发网络协议系统运维
教程: HTML教程 CSS教程 JavaScript教程 Go语言教程 JQuery教程 VUE教程 VUE3教程 Bootstrap教程 SQL数据库教程 C语言教程 C++教程 Java教程 Python教程 Python3教程 C#教程
数码: 电脑笔记本显卡显示器固态硬盘硬盘耳机手机 iphone vivo oppo 小米华为单反装机图拉丁

-> 人工智能 -> Python的Numpy与Pandas包的使用 -> 正文阅读

[人工智能]Python的Numpy与Pandas包的使用

1.NumPy基础使用

import numpy as np
import random
import matplotlib.pyplot as plt

# 安装 numpy, scipy, pandas
# numpy:  高级数组,线性代数中矩阵
# scipy:  科学计算基础公式库
# pandas:  数据分析工具
# matplotlib:  绘图

# 1.使用numpy创建一维数组 （list，tuple）
lst = [1, 2, 3, 4]  # 使用列表创建lst
tup = (1, 2, 3, 4, 5)  # 使用元组创建
# set = {1, 2, 3, 4, 5}  # 不能创建
# s = 'candle' # 不能创建
# dic = {"key1":"value1"}
arr = np.array(lst)
# print(arr[0])

# 2.创建连续区间数字的一维数组 np.arange（start,end,step）
# lst = [i for i in range(1, 21)]
# print(lst)
# arr = np.array(lst)
# print(arr)
# 练习1：产生连续数字1~20的数组（列表生成式，arange）
arr = np.arange(1, 21)
# print(arr)
# 练习2：产生1 4 7 … 16的数组
arr = np.arange(1, 17, 3)
# print(arr)
# 练习3：产生9~1的数组
arr = np.arange(9, 0, -1)
# print(arr)

# 4.创建随机数的数组（1-9范围内，数组中随机10个数字）
# lst = [ random.randint(1, 10) for i in range(1, 11)]
arr = np.random.randint(1, 10, size=10)
# print(arr)

# 5.	一维数组元素获取【index】 及查看元素类型dtype
# arr = np.array([1, 2, 3, 4, 5, 6], dtype=np.int8)
# print(arr[0].dtype)  # 默认是int32
# numpy int默认类型随平台和版本而定(int32 或 int64)，可选位宽 int8~int64
# arr = np.array([1, 2, 3, 4, 5.4, 6])  # [1.  2.  3.  4.  5.4 6. ]
# print(arr[0].dtype)
# numpy float默认 float64   16~256
arr = np.array([1, 2, 3, 4.16, 'candle'])
# print(arr)
# print(arr[4].dtype)  # <U6  字符串类型 大小为6

# 6.查看维度ndim，元素个数size，数组形状shape
arr = np.arange(1, 10)
# print(arr.ndim)  # 查看数组维度
# print(arr.size)  # 查看数组中元素的个数
# print(arr.shape)  # 查看数组形状 每个维度大小


# 7.Numpy创建一个连续数字的二维数组（1~12）
# 多维数组,每个维度下,个数要一致
lst1 = [[1, 2, 3], (4, 5, 6), [7, 8, 9]]   # 二维数组
lst2 = [[1, 2, 3], (4, 5, 6), [7, 8, 9, 10]]  # 一维数组
# arr = np.array(lst2)
# print(arr)
# print(arr.shape)

# 1. 一维升维度后变成二维  reshape(x,y,z)
arr = np.arange(1, 13)  # 一维数组 有12个成员 升维度为二维数组 1*12 2*6 3*4 4*3 6*2 12*1
# print(arr.shape)  # (12,)
arr2d = arr.reshape(3, 4)  # 3行4列
# print(arr2d)
# print(arr2d.shape)
# 必须满足 行*列 = 个数
# arr2d = arr.reshape(6, 6)  # ValueError: cannot reshape array of size 12 into shape (6,6)
arr2d = arr.reshape(-1, 6)  # -表示自动匹配剩余的维度
# print(arr2d)

# arr2d = arr.reshape(-1, -1)  # ValueError: can only specify one unknown dimension

# 2.创建时指定维度(随机数数组)
# 8.产生1-9随机数的9个元素的二维数组（要求3*3）
arr = np.random.randint(1, 10, size=9)
# print(arr.reshape(3, 3))

arr = np.random.randint(1, 10, size=(3,3))
# print(arr)

arr = np.random.randint(1, 13, size=(2, 2, 3))
# print(arr)

# 9.	使用numpy创建矩阵
# Method 1: np.array() builds a plain 2-D ndarray from nested lists.
arr = np.array([[1, 2, 3], [4, 5, 6]])
# print(arr)

# Method 2: np.asmatrix() builds an np.matrix from nested lists or from a
# MATLAB-style string.  It replaces np.mat, which was removed in NumPy 2.0.
brr = np.asmatrix([[1, 2, 3], [4, 5, 6]])
# print(brr)

crr = np.asmatrix("1 2 3;4 5 6")
# print(crr)


# 10.	创建特殊矩阵
arr = np.ones((3, 5), dtype=np.int8)
# print(arr)
arr = np.zeros((2, 2))
# print(arr)
arr = np.eye(3)
# print(arr)

# 练习2：模拟抛掷500次硬币，正面表示1，反面表示0
arr = np.random.randint(0, 2, size=500)
print(arr)

# 练习3：模拟抛掷500次硬币，正面表示1，反面表示-1
arr = np.where(arr == 0, -1, 1)
print(arr)

# 练习4：绘制随机漫步曲线（cumsum 数组累加求和）
# 对抛硬币结果进行累加求和
arr = np.cumsum(arr)
print(arr)

plt.plot(arr)
plt.title("my title")
plt.xlabel("my xlabel")
plt.ylabel("my cumsum")
plt.show()

其他使用

import numpy as np
import random

# 1, 2: build 1-D arrays with NumPy.
# Only sequences (list, tuple) are converted element by element.
lst = [1, 2, 3, 4]
tup = (1, 2, 3, 4, 5)

# Variable names chosen so the builtins set / str / dict are not shadowed.
num_set = {1, 2, 3, 4, 5}
text = 'candle'
ages = {"lihua": 28, "zm": 18}

arr1 = np.array(lst)  # [1 2 3 4]
arr2 = np.array(tup)  # [1 2 3 4 5]

# A set, a str and a dict are each wrapped whole into a 0-d array
# instead of being unpacked into elements.
arr3 = np.array(num_set)
arr4 = np.array(text)
arr5 = np.array(ages)
print(arr1)  # [1 2 3 4]
print(arr2)  # [1 2 3 4 5]
print(arr3)  # {1, 2, 3, 4, 5}
print(arr4)  # candle
print(arr5)  # {'lihua': 28, 'zm': 18}
# 3.创建连续区间数字的一维数组 np.arange（start,end,step）
# 练习1：产生连续数字1~20的数组（列表生成式，arange）
lst2 = [i for i in range(1, 21)]
print(lst2)  # [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20]
arr6 = np.array(lst2)
print(arr6)  # [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]
# 练习2：产生1 4 7 … 16的数组
arr7 = np.arange(1, 17, 3)
print(arr7)  # [ 1  4  7 10 13 16]
# 练习3：产生9~1的数组
arr8 = np.arange(9, 0, -1)
print(arr8)  # [9 8 7 6 5 4 3 2 1]

# 4.创建随机数的数组（1-9范围内，数组中随机10个数字）
print("4***********")
lst9 = [random.randint(1, 10) for i in range(1, 11)]
print(lst9)  # [5, 5, 8, 8, 7, 8, 3, 3, 6, 10]
# np.random.randint(low,high,size,dtype)
arr9 = np.random.randint(1, 10, size=10)
print(arr9) # [5 7 9 4 4 4 4 4 8 3]

# 5.	一维数组元素获取【index】 及查看元素类型dtype
# (1)numpy 的int类型  默认随平台和版本而定(int32 或 int64)  可选 int8~int64
arr9 = np.array([1, 2, 3, 4, 5.4, 6], dtype=np.int8)
print(arr9[0].dtype)  # int8

# (2) numpy float默认  float64 16~256
# 发现有一个为浮点类型 5.4 其他全部转了
arr10 = np.array([1, 2, 3, 4, 5.4, 6])
print(arr10)  # [1.  2.  3.  4.  5.4 6. ]
print(arr10[0].dtype)  # float64

# (3) String type: mixing numbers with a str turns every element into a string
arr11 = np.array([1, 2, 3, 4.16, 'lihua'])
print(arr11) # ['1' '2' '3' '4.16' 'lihua']
print(arr11[2].dtype)  # <U1  unicode string of length 1
print(arr11[4].dtype)  # <U5  unicode string of length 5

# 6. Inspect the number of dimensions (ndim), element count (size) and shape
arr = np.arange(1, 10)
print(arr)  # [1 2 3 4 5 6 7 8 9]
# Number of dimensions
print(arr.ndim)  # 1
# Number of elements in the array
print(arr.size)  # 9
# Shape of the array: the length along each dimension
print(arr.shape)  # (9,)

# 7.	Numpy创建一个连续数字的二维数组（1~12）

print('Numpy创建一个连续数字的二维数组（1~12）')
# Every row must have the same length for NumPy to build a multi-dimensional array.
# Rectangular input -> 2-D array
lst1 = [[1, 2, 3], (4, 5, 6), [7, 8, 9]]
arr = np.array(lst1)
print(arr.shape)  # (3, 3): 3 rows, each holding 3 elements
print(lst1)  # [[1, 2, 3], (4, 5, 6), [7, 8, 9]]
# Ragged input -> 1-D array whose elements are Python lists.
# dtype=object must be given explicitly: NumPy >= 1.24 raises ValueError
# when asked to build an array from ragged nested sequences without it.
lst2 = [[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]]
print(np.array(lst2, dtype=object).shape)  # (3,)
print(lst2)  # [[1, 2, 3], [4, 5, 6], [7, 8, 9, 10]]

# (1) A.一维升维度后变成二维  reshape(x,y,z)
arr = np.arange(1, 13)
print(arr)  # [ 1  2  3  4  5  6  7  8  9 10 11 12]
print(arr.shape)  # (12,)

arr2d = arr.reshape(3, 4)
print(arr2d.shape)  # (3, 4)
print(arr2d)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]

# B.-1 表示自动匹配维度
print("B.-1 表示自动匹配维度")
arr2 = arr.reshape(-1, 6)
print(arr2)
# [[ 1  2  3  4  5  6]
#  [ 7  8  9 10 11 12]]

arr3 = arr.reshape(3, -1)
print(arr3)
# [[ 1  2  3  4]
#  [ 5  6  7  8]
#  [ 9 10 11 12]]

# C.reshape(3,4) 参数相乘必须等于个数
# arr2=arr.reshape(2,4)
# print(arr2) # ValueError: cannot reshape array of size 12 into shape (2,4)

# (2)创建时指定维度(随机数数组)
# 直接再size中指定二维
arr2 = np.random.randint(1, 10, size=(3, 3))
print(arr2)
# [[1 6 6]
#  [7 9 4]
#  [6 4 5]]

# 8. 产生一个1-9随机数的9个元素的二维数组(要求3*3)
arr = np.random.randint(1, 10, size=9)
print(arr.reshape(3, 3))
# [[5 8 8]
#  [6 8 1]
#  [5 2 8]]

# 直接再size中指定二维
arr2 = np.random.randint(1, 10, size=(3, 3))
print(arr2)
# [[1 6 6]
#  [7 9 4]
#  [6 4 5]]

# 三维数组
arr3 = np.random.randint(1, 13, size=(2, 2, 3))
print(arr3)

# [[[ 6  7  4]
#   [10  1 11]]
#
#  [[ 1  8  9]
#   [ 3 11  9]]]

#9.使用numpy创建矩阵
print("9******")
# Method 1: np.array() builds a plain 2-D ndarray from nested lists
arr = np.array([[1, 2], [3, 4]])
print(arr)
# [[1 2]
#  [3 4]]

# Method 2: np.asmatrix() accepts nested lists or a MATLAB-style string.
# It replaces np.mat, which was removed in NumPy 2.0.
brr = np.asmatrix([[1, 2], [3, 4]])
print(brr)
# [[1 2]
#  [3 4]]
crr = np.asmatrix("1 2;3 4")
print(crr)
# [[1 2]
#  [3 4]]

# 10.	创建特殊矩阵
# numpy.ones(shape, dtype)  产生全1矩阵
# numpy.zeros(shape, dtype)  产生全0矩阵
# numpy.eye(N, M=None, k=0, dtype)  产生主对角线为1、其余为0的矩阵(单位矩阵)

arr = np.ones((3, 5), dtype=np.int8)
print(arr)
# [[1 1 1 1 1]
#  [1 1 1 1 1]
#  [1 1 1 1 1]]
arr = np.zeros((2, 2))
print(arr)
# [[0. 0.]
#  [0. 0.]]
arr = np.eye(3)
print(arr)
# [[1. 0. 0.]
#  [0. 1. 0.]
#  [0. 0. 1.]]

# 11.	Numpy 常用替换函数
# np.where(condition, x, y)：满足条件(condition)，输出x，不满足输出y。
# np.where(condition)：满足条件返回给定数组索引


# 12.	Numpy 课堂练习
# Exercise 1: a 3x3 array holding each of the digits 1-9 exactly once, in random order
print("12*****")
# permutation() reorders the values without repeats; randint() could draw duplicates
arr9 = np.random.permutation(np.arange(1, 10)).reshape(3, 3)
print(arr9)
# e.g.
# [[5 8 3]
#  [1 9 6]
#  [7 2 4]]
# Exercise 2: simulate 500 coin tosses, heads = 1, tails = 0
arr = np.random.randint(0, 2, size=500)
print(arr)  # e.g. [0 1 0 1 ...]
# Exercise 3: the same tosses with heads = 1, tails = -1
arr = np.where(arr == 0, -1, 1)
print(arr)  # e.g. [-1 1 -1 1 ...]: every 0 becomes -1, every 1 is kept
# Exercise 4: random-walk curve (cumsum = running total)
# Accumulate the +1 / -1 steps
arr = np.cumsum(arr)
print(arr)  # e.g. [-1 0 -1 0 ...]

import matplotlib.pyplot as plt
# 画图
plt.plot(arr)
# 设置图片的标题,x轴,y轴的标签
plt.title("my title")
plt.xlabel("my xlabel")
plt.ylabel("my ylabel")
# 在终端显示图片
# plt.show()

1.1 ndarray对象

import numpy as np

# python中没有原生数组数据类型,ndarray就是python的引用数据类型:数组
a = np.array([1, 2, 3])
print(a)  # [1 2 3]

# 多于一个维度

a = np.array([[1, 2], [3, 4]])
print(a)

# [[1 2]
# [3 4]]

# 最小维度

a = np.array([1, 2, 3, 4, 5], ndmin=2)
print(a)  # [[1 2 3 4 5]]

# dtype 参数

a = np.array([1, 2, 3], dtype=complex)
print(a)  # [1.+0.j 2.+0.j 3.+0.j]

1.2 numpy数据类型

import numpy as np

# 使用标量类型
dt = np.dtype(np.int32)
print(dt)  # int32

# int8, int16, int32, int64 四种数据类型可以使用字符串 'i1', 'i2','i4','i8' 代替
dt = np.dtype('i4')
print(dt)  # int32

# 字节顺序标注
dt = np.dtype('<i4')
print(dt)  # int32

# 首先创建结构化数据类型
dt = np.dtype([('age', np.int8)])
print(dt)  # [('age', 'i1')]   age的数据类型是int8

# 将数据类型应用于 ndarray 对象
dt = np.dtype([('age', np.int8)])
a = np.array([(10,), (20,), (30,)], dtype=dt)
print(a)  # [(10,) (20,) (30,)]  #数据类型是int8,代表int中的数据长度

# 类型字段名可以用于存取实际的 age 列
dt = np.dtype([('age', np.int8)])
a = np.array([(10,), (20,), (30,)], dtype=dt)
print(a['age'])  # [10 20 30]

student = np.dtype([('name', 'S20'), ('age', 'i1'), ('marks', 'f4')])
print(student)  # [('name', 'S20'), ('age', 'i1'), ('marks', '<f4')]  # 列表 [] 里面是元组  ()  每一个库就相当于一个框架

# 列表中的元组不同的代表不同元素name,age,marks的数据类型
student = np.dtype([('name', 'S20'), ('age', 'i1'), ('marks', 'f4')])
a = np.array([('abc', 21, 50), ('xyz', 18, 75)], dtype=student)
print(a)  # [(b'abc', 21, 50.) (b'xyz', 18, 75.)]

1.3 numpy数组属性

import numpy as np

#  ndarray.ndim 用于返回数组的维数，等于秩
a = np.arange(24)
print(a.ndim)  # a 现只有一个维度   1
# 现在调整其大小
b = a.reshape(2, 4, 3)  # b 现在拥有三个维度
print(b.ndim)  # 3

# ndarray.shape 表示数组的维度，返回一个元组，这个元组的长度就是维度的数目，即 ndim 属性(秩)。比如，一个二维数组，其维度表示"行数"和"列数"。
a = np.array([[1, 2, 3], [4, 5, 6]])
print(a.shape)  # (2, 3) 2行3列的数组

#  将数组调整成为3行2列的数组
a = np.array([[1, 2, 3], [4, 5, 6]])
a.shape = (3, 2)
print(a)

# [1,2,3]
# [4,5,6]

# [1 2]
# [3 4]
# [5 6]]

#  NumPy 也提供了 reshape(改造) 函数来调整数组大小。
a = np.array([[1, 2, 3], [4, 5, 6]])
b = a.reshape(3, 2)
print(b)

# [[1 2]
# [3 4]
# [5 6]]

#  ndarray.itemsize 以字节的形式返回数组中每一个元素的大小,单位字节。
# 数组的 dtype 为 int8（一个字节等于8位）
x = np.array([1, 2, 3, 4, 5], dtype=np.int8)
print(x.itemsize)  # 1

# 数组的 dtype 现在为 float64（八个字节等于64位）
y = np.array([1, 2, 3, 4, 5], dtype=np.float64)
print(y.itemsize)  # 8


#  ndarray.flags 返回 ndarray 对象的内存信息
x = np.array([1, 2, 3, 4, 5])
print(x.flags)
# C_CONTIGUOUS: True
# F_CONTIGUOUS: True
# OWNDATA: True
# WRITEABLE: True
# ALIGNED: True
# WRITEBACKIFCOPY: False
# UPDATEIFCOPY: False

1.4 numpy创建数组

import numpy as np

#  (1)numpy.empty 方法用来创建一个指定形状（shape）、数据类型（dtype）且未初始化的数组：
x = np.empty([3, 2], dtype=int)
print(x)  # 生成的3行2列的数组的值是随机的
#  注意 ? 数组元素为随机值，因为它们未初始化
# [[-1212020656       32764]
# [-1212016064       32764]
# [    6881397         100]]

# (2)创建指定大小的数组，数组元素以 0 来填充：
# 默认为浮点数
x = np.zeros(5)
print(x)  # [0. 0. 0. 0. 0.]

# Integer dtype: pass the builtin int (the np.int alias was removed in NumPy 1.24)
y = np.zeros((5,), dtype=int)
print(y)  # [0 0 0 0 0]

# 自定义类型 x ,y 的数据类型是int32
z = np.zeros((2, 2), dtype=[('x', 'i4'), ('y', 'i4')])
print(z)

# [[(0, 0) (0, 0)]
# [(0, 0) (0, 0)]]

# (3)创建指定形状的数组，数组元素以 1 来填充：
# 默认为浮点数
x = np.ones(5)
print(x)

# [1. 1. 1. 1. 1.]

# 自定义类型 2行2列的数组 初始值为1
x = np.ones([2, 2], dtype=int)
print(x)

# [[1 1]
#  [1 1]]

x = np.ones([3, 2], dtype=int)  # 3行两列的数组,初始值为1
print(x)

# [[1 1]
# [1 1]
# [1 1]]

1.5 numpy从已有的数据中创建数组

import numpy as np

#  将列表转换为 ndarray:
x = [1, 2, 3]
a = np.asarray(x)
print(a)  # [1 2 3]

#  将元组转换为 ndarray:
x = (1, 2, 3)
a = np.asarray(x)
print(a)  # [1 2 3]

# Convert a list of tuples to an ndarray.  The tuples differ in length, so the
# result is a 1-D object array; NumPy >= 1.24 raises ValueError for ragged
# input unless dtype=object is given.
x = [(1, 2, 3), (4, 5)]
a = np.asarray(x, dtype=object)
print(a)  # [(1, 2, 3) (4, 5)]

#  设置了 dtype 参数：
x = [1, 2, 3]
a = np.asarray(x, dtype=float)
print(a)  # [1. 2. 3.]

#  numpy.frombuffer 将实现了缓冲区协议的对象(如 bytes)按给定的 dtype 解释为一维数组。
# python 3
s = b'Hello World'
a = np.frombuffer(s, dtype='S1')
print(a)  # [b'H' b'e' b'l' b'l' b'o' b' ' b'W' b'o' b'r' b'l' b'd']

#  注意：buffer 是字符串的时候，Python3 默认 str 是 Unicode 类型，所以要转成 bytestring 在原 str 前加上 b。

# python 2
# s = 'Hello World'
# a = np.frombuffer(s, dtype='S1')
# print(a)

# numpy.fromiter builds a 1-D ndarray from an iterable.
# range(5) is a lazy range object; the variable is not called `list` so the
# builtin list() stays usable.
numbers = range(5)
it = iter(numbers)
print(it)  # e.g. <range_iterator object at 0x000001E45AD94F50>
# Build the ndarray from the iterator
x = np.fromiter(it, dtype=float)
print(x)  # [0. 1. 2. 3. 4.]

1.6 NumPy从数值范围创建数组

import numpy as np

# 1.numpy.arange  numpy 包中的使用 arange 函数创建数值范围并返回 ndarray 对象

#  生成 0 到 5 的数组:
x = np.arange(5)
print(x)  # [0 1 2 3 4]

#  设置返回类型位 float:
# 设置了 dtype
x = np.arange(5, dtype=float)
print(x)  # [0. 1. 2. 3. 4.]

#  设置了起始值、终止值及步长：
x = np.arange(10, 20, 2)
print(x)  # [10 12 14 16 18]

# 2.numpy.linspace  numpy.linspace 函数用于创建一个一维数组，数组是一个等差数列构成的

# 以下实例用到三个参数，设置起始点为 1 ，终止点为 10，数列个数为 10。
a = np.linspace(1, 10, 10)
print(a)  # [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]

# 设置元素全部是1的等差数列：
a = np.linspace(1, 1, 10)
print(a)  # [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]

# 将 endpoint(端点) 设为 false，不包含终止值：

a = np.linspace(10, 20, 5, endpoint=False)
print(a)  # [10. 12. 14. 16. 18.]

# 如果将 endpoint(端点) 设为 true，则会包含 20。

# 以下实例设置间距。 retstep=True 会同时返回等差数列的步长(公差)，结果是 (数组, 步长) 的元组；默认不返回步长。
a = np.linspace(1, 10, 10, retstep=True)
print(a)  # (array([ 1.,  2.,  3.,  4.,  5.,  6.,  7.,  8.,  9., 10.]), 1.0)
a = np.linspace(10, 20, 5, retstep=True)  # (array([10. , 12.5, 15. , 17.5, 20. ]), 2.5)   (10到20之间要有5个数,等差是2.5 )

print(a)
# 拓展例子  1行10列,重新变成10行1列
b = np.linspace(1, 10, 10).reshape([10, 1])
print(b)

'''
[ 1.]
 [ 2.]
 [ 3.]
 [ 4.]
 [ 5.]
 [ 6.]
 [ 7.]
 [ 8.]
 [ 9.]
 [10.]]
'''
#  3.numpy.logspace  numpy.logspace 函数用于创建一个于等比数列

# 默认底数是 10  base就是等比数列的q
a = np.logspace(1.0, 2.0, num=10)
print(a)
# 10^1(10. )  ......      10^2(100.)
# [ 10.          12.91549665  16.68100537  21.5443469   27.82559402
# 35.93813664  46.41588834  59.94842503  77.42636827 100.        ]

# Use base 2 instead of the default base 10:
a = np.logspace(0, 9, 10, base=2)
print(a)

# exponents: 0    1    2    3    4    5    6    7    8    9
# values:    2^0  2^1  2^2  2^3  2^4  2^5  2^6  2^7  2^8  2^9
#  [  1.   2.   4.   8.  16.  32.  64. 128. 256. 512.]

1.7 numpy的切片和索引

import numpy as np

a = np.arange(10)
s = slice(2, 7, 2)  # 从索引 2 开始到索引 7 停止，间隔为2
print(a[s])  # [2 4 6]

#  我们也可以通过冒号分隔切片参数 start:stop:step 来进行切片操作：
a = np.arange(10)
b = a[2:7:2]  # 从索引 2 开始到索引 7 停止，间隔为 2
print(b)  # [2 4 6]

a = np.arange(10)  # [0 1 2 3 4 5 6 7 8 9]
b = a[5]
print(b)  # 5

a = np.arange(10)
print(a[2:])  # [2 3 4 5 6 7 8 9]

a = np.arange(10)  # [0 1 2 3 4 5 6 7 8 9]
print(a[2:5])  # [2 3 4]

a = np.array([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
print(a)
# java中的二维数组,java中的数组是用(),在python中的数组是用 [ ]
#  [[1 2 3]
#  [3 4 5]
#  [4 5 6]]


# 从某个索引处开始切割
print('从数组索引 a[1:] 处开始切割')
print(a[1:])
# [[3 4 5]
# [4 5 6]]


a = np.array([[1, 2, 3], [3, 4, 5], [4, 5, 6]])
print(a)
# [[1 2 3]
# [3 4 5]
# [4 5 6]]
# 下标还是从0开始的,1是第2个元素了 [ 行参数,列参数 ]
print(a[..., 1])  # 第2列元素    [2 4 5]
print(a[1, ...])  # 第2行元素    [3 4 5]
print(a[..., 1:])  # 第2列及剩下的所有元素
# [[2 3]
# [4 5]
# [5 6]]

1.8 numpy高级索引

# 1.整数数组索引
# 以下实例获取数组中(0,0)，(1,1)和(2,0)位置处的元素。

import numpy as np

x = np.array([[1, 2], [3, 4], [5, 6]])
y = x[[0, 1, 2], [0, 1, 0]]
print(y)  # [1 4 5]

# 以下实例获取了 4X3 数组中的四个角的元素。 行索引是 [0,0] 和 [3,3]，而列索引是 [0,2] 和 [0,2]。
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
print('我们的数组是：')
print(x)
print('\n')
'''
我们的数组是：
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
'''
rows = np.array([[0, 0], [3, 3]])
cols = np.array([[0, 2], [0, 2]])
y = x[rows, cols]

# [[0, 0], [3, 3]],[[0, 2], [0, 2]]

#  [ [0,0] [0,2]] [[3,0],[3,2]]

#    0     2        9    11
print('这个数组的四个角元素是：')
print(y)
# [[ 0  2]
# [ 9 11]]

# 可以借助切片 : 或 … 与索引数组组合。如下面例子：
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
b = a[1:3, 1:3]
c = a[1:3, [1, 2]]
d = a[..., 1:]
print(b)
print(c)
print(d)
# [1,2],[1,2]  -> [1,1] [1,2] [2,1] [2,2]
# [[5 6]
# [8 9]]

# [[5 6]
# [8 9]]

#  输出2,3 列
# [[2 3]
# [5 6]
# [8 9]]

#  2.布尔索引
# 以下实例获取大于 5 的元素：
x = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]])
print('我们的数组是：')
print(x)
print('\n')
'''
我们的数组是：
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
'''

# 现在我们会打印出大于 5 的元素   二维数组变成一维度数组了
print('大于 5 的元素是：')
print(x[x > 5])

# 大于 5 的元素是：
# [ 6  7  8  9 10 11]


# 以下实例使用了 ~（取补运算符）来过滤 NaN。
a = np.array([np.nan, 1, 2, np.nan, 3, 4, 5])
print(a[~np.isnan(a)])  # [1. 2. 3. 4. 5.]

# 以下实例演示如何从数组中过滤掉非复数元素。
a = np.array([1, 2 + 6j, 5, 3.5 + 5j])
print(a[np.iscomplex(a)])  # [2. +6.j 3.5+5.j]

# 3.花式索引

# (1)传入顺序索引数组
x = np.arange(32).reshape((8, 4))

print(x)  # 0-31的8行4列数组
'''
[[ 0  1  2  3]   -8
 [ 4  5  6  7]   -7
 [ 8  9 10 11]   -6
 [12 13 14 15]   -5
 [16 17 18 19]   -4
 [20 21 22 23]   -3
 [24 25 26 27]   -2
 [28 29 30 31]]  -1
'''
print(x[[4, 2, 1, 7]])  # 取上面相应的行

# [[16 17 18 19]
# [ 8  9 10 11]
# [ 4  5  6  7]
# [28 29 30 31]]

# (2)传入倒序索引数组
x = np.arange(32).reshape((8, 4))
print(x[[-4, -2, -1, -7]])
# [[16 17 18 19]
# [24 25 26 27]
# [28 29 30 31]
# [ 4  5  6  7]]


# (3)传入多个索引数组（要使用np.ix_）
x = np.arange(32).reshape((8, 4))
print(x[np.ix_([1, 5, 7, 2], [0, 3, 1, 2])])
# [1,0] 4
# [1,3] 7
# ....
# [2,2] 10


# [[ 4  7  5  6]
# [20 23 21 22]
# [28 31 29 30]
# [ 8 11  9 10]]

1.9 NumPY广播(Broadcast)

# 广播就是在进行数组的 + - *  逆的时候,可以自动补齐

#  4x3 的二维数组与长为 3 的一维数组相加，等效于把数组 b 在二维上重复 4 次再运算：

import numpy as np

a = np.array([[0, 0, 0],
              [10, 10, 10],
              [20, 20, 20],
              [30, 30, 30]])
b = np.array([1, 2, 3])
bb = np.tile(b, (4, 1))

# ([1, 2, 3],(4,1))
print(a + bb)

# [[ 1  2  3]
# [11 12 13]
# [21 22 23]
# [31 32 33]]

a = np.array([[0, 0, 0],
              [10, 10, 10],
              [20, 20, 20],
              [30, 30, 30]])
b = np.array([1, 2, 3])
print(a + b)

# [[ 1  2  3]
# [11 12 13]
# [21 22 23]
# [31 32 33]]

1.10 numpy迭代数组

import numpy as np

# a 和 a.T 的遍历顺序是一样的  默认按行访问
a = np.arange(6).reshape(2, 3)
for x in np.nditer(a.T):
    print(x, end=", ")  # 0, 1, 2, 3, 4, 5,
print('\n')
# a.T.copy(order='C') 这样设置就是按照列访问
for x in np.nditer(a.T.copy(order='C')):
    print(x, end=", ")  # 0, 3, 1, 4, 2, 5,
print('\n')

# a.T 是a的转置

1.11 numpy数组操作

# numpy.rollaxis   numpy.swapaxes  这个是不懂 ?????
# numpy.rollaxis numpy.rollaxis 函数向后滚动特定的轴到一个特定位置

import numpy as np

# Build a 3-D ndarray of shape (2, 2, 2)
a = np.arange(8).reshape(2, 2, 2)

print('原数组：')
print(a)
print('\n')
# Original array:
# [[[0 1]
#   [2 3]]
#
#  [[4 5]
#   [6 7]]]

# Roll axis 2 to position 0; start defaults to 0, i.e. np.rollaxis(a, 2, 0)
print('调用 rollaxis 函数：')
print(np.rollaxis(a, 2))
# result[k, i, j] == a[i, j, k]
# [[[0 2]
#   [4 6]]
#
#  [[1 3]
#   [5 7]]]
print('\n')

# Roll axis 2 to position 1
print('调用 rollaxis 函数：')
print(np.rollaxis(a, 2, 1))
# result[i, k, j] == a[i, j, k]
# [[[0 2]
#   [1 3]]
#
#  [[4 6]
#   [5 7]]]

1.13 NumPy5种常见函数

# 字符串函数  数学函数  算术函数  统计函数  排序条件筛选函数


# 统计函数
import numpy as np

a = np.array([[3, 7, 5], [8, 4, 3], [2, 4, 9]])
print('我们的数组是：')
print(a)
print('\n')
print('调用 amin() 函数：')
print(np.amin(a, 1))
print('\n')
print('再次调用 amin() 函数：')
print(np.amin(a, 0))
print('\n')
print('调用 amax() 函数：')
print(np.amax(a))
print('\n')
print('再次调用 amax() 函数：')
print(np.amax(a, axis=0))

'''
我们的数组是：
[[3 7 5]
 [8 4 3]
 [2 4 9]]


调用 amin() 函数：  np.amin(a, 1) 每行的最小值    行是轴1
[3 3 2] 


再次调用 amin() 函数： np.amin(a, 0) 每列的最小值   列是轴0 
[2 4 3]


调用 amax() 函数：  np.amax(a) 所有元素中的最大值
9


再次调用 amax() 函数：   np.amax(a, axis=0) 轴为0的最大值   每列的最大值 
[8 7 9]
'''

# numpy.percentile() 百分位数 中位数

1.17 NumPy Matplotlib绘图

import numpy as np
from matplotlib import pyplot as plt

# 1.
x = np.arange(1, 11)
y = 2 * x + 5
plt.title("Matplotlib demo")
plt.xlabel("x axis caption")
plt.ylabel("y axis caption")
plt.plot(x, y)
plt.show()

#  2.图形中文显示
import numpy as np
from matplotlib import pyplot as plt
import matplotlib

# fname 为 你下载的字体库路径，注意 SimHei.ttf 字体的路径
zhfont1 = matplotlib.font_manager.FontProperties(fname="SimHei.ttf")

x = np.arange(1, 11)
y = 2 * x + 5
plt.title("菜鸟教程 - 测试", fontproperties=zhfont1)

# fontproperties 设置中文显示，fontsize 设置字体大小
plt.xlabel("x 轴", fontproperties=zhfont1)
plt.ylabel("y 轴", fontproperties=zhfont1)
plt.plot(x, y)
plt.show()

# 3.要显示圆来代表点，而不是上面示例中的线，请使用 ob 作为 plot() 函数中的格式字符串。
import numpy as np
from matplotlib import pyplot as plt

x = np.arange(1, 11)
y = 2 * x + 5
plt.title("Matplotlib demo")
plt.xlabel("x axis caption")
plt.ylabel("y axis caption")
plt.plot(x, y, "ob")
plt.show()

# 4.绘制正弦波 以下实例使用 matplotlib 生成正弦波图

import numpy as np
import matplotlib.pyplot as plt

# 计算正弦曲线上点的 x 和 y 坐标
x = np.arange(0, 3 * np.pi, 0.1)
y = np.sin(x)
plt.title("sine wave form")
# 使用 matplotlib 来绘制点
plt.plot(x, y)
plt.show()

# 5.subplot()subplot(次要情节)() 函数允许你在同一图中绘制不同的东西。

import numpy as np
import matplotlib.pyplot as plt

# 计算正弦和余弦曲线上的点的 x 和 y 坐标
x = np.arange(0, 3 * np.pi, 0.1)
y_sin = np.sin(x)
y_cos = np.cos(x)
# 建立 subplot(次要情节) 网格，高为 2，宽为 1
# 激活第一个 subplot
plt.subplot(2, 1, 1)
# 绘制第一个图像
plt.plot(x, y_sin)
plt.title('Sine')
# 将第二个 subplot(次要情节) 激活，并绘制第二个图像
plt.subplot(2, 1, 2)
plt.plot(x, y_cos)
plt.title('Cosine')
# 展示图像
plt.show()

# 6.bar()
# pyplot 子模块提供 bar() 函数来生成条形图。
# 以下实例生成两组 x 和 y 数组的条形图。

from matplotlib import pyplot as plt

x = [5, 8, 10]
y = [12, 16, 6]
x2 = [6, 9, 11]
y2 = [6, 15, 7]
plt.bar(x, y, align='center')
plt.bar(x2, y2, color='g', align='center')
plt.title('Bar graph')
plt.ylabel('Y axis')
plt.xlabel('X axis')
plt.show()

# 7.numpy.histogram()
import numpy as np

a = np.array([22, 87, 5, 43, 56, 73, 55, 54, 11, 20, 51, 5, 79, 31, 27])
np.histogram(a, bins=[0, 20, 40, 60, 80, 100])
hist, bins = np.histogram(a, bins=[0, 20, 40, 60, 80, 100])
print(hist)
print(bins)

# 8.hist()
# Matplotlib 可以将直方图的数字表示转换为图形。 pyplot 子模块的 hist() 函数将数据数组和 bins 数组作为参数，并绘制成直方图。
from matplotlib import pyplot as plt
import numpy as np

a = np.array([22, 87, 5, 43, 56, 73, 55, 54, 11, 20, 51, 5, 79, 31, 27])
plt.hist(a, bins=[0, 20, 40, 60, 80, 100])
plt.title("histogram")
plt.show()

1.18 numpy IO

import numpy as np

a = np.array([1, 2, 3, 4, 5])

# 保存到 outfile.npy 文件上
np.save('outfile.npy', a)

# 保存到 outfile2.npy 文件上，如果文件路径末尾没有扩展名 .npy，该扩展名会被自动加上
np.save('outfile2', a)

b = np.load('outfile.npy')
print(b)  # [1 2 3 4 5]

a = np.array([[1, 2, 3], [4, 5, 6]])
b = np.arange(0, 1.0, 0.1)
c = np.sin(b)
# c 使用了关键字参数 sin_array
np.savez("runoob.npz", a, b, sin_array=c)
r = np.load("runoob.npz")
print(r.files)  # 查看各个数组名称  ['sin_array', 'arr_0', 'arr_1']
print(r["arr_0"])  # 数组 a [[1 2 3] [4 5 6] ]
print(r["arr_1"])  # 数组 b   [0.  0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9]
print(r["sin_array"])  # 数组 c
# [0.         0.09983342 0.19866933 0.29552021 0.38941834 0.47942554
# 0.56464247 0.64421769 0.71735609 0.78332691]

a = np.array([1, 2, 3, 4, 5])
np.savetxt('out.txt', a)
b = np.loadtxt('out.txt')

print(b)  # [1. 2. 3. 4. 5.]

a = np.arange(0, 10, 0.5).reshape(4, -1)
np.savetxt("out.txt", a, fmt="%d", delimiter=",")  # 改为保存为整数，以逗号分隔
b = np.loadtxt("out.txt", delimiter=",")  # load 时也要指定为逗号分隔
print(b)  # [[0. 0. 1. 1. 2.]
# [2. 3. 3. 4. 4.]
# [5. 5. 6. 6. 7.]
# [7. 8. 8. 9. 9.]]

2.Numpy高级用法

import numpy as np
import random

# 1.切片 和序列一致
arr = np.arange(3, 15)
print(arr[2:4])
print(arr[-1])
print(arr[::2])

brr = arr  # plain assignment: both names refer to the same array object
# print(brr)
print(id(brr), id(arr))  # identical ids
brr = arr[:]  # NOT a copy: a full slice returns a new array object that is a view on arr's data; use arr.copy() for an independent copy
# print(brr)
print(id(brr), id(arr))  # different ids, but the underlying data is shared

# 2.高级切片 布尔类型切片
print("*"*100)
print(arr > 6)  # arr > 6 数组中每个成员的值和表达运算 ,结果生成一个新的数组
print(arr[arr > 6])  # 为True成员保留  为False删除
print(arr[arr != 9])
# 切片 大于9 小于12   & | ~
print(arr[(arr > 9) & (arr < 12)] )

#3.	Numpy向量运算（）
print("*"*100)
arr = np.array([1, 2, 3, 4, 5])
brr = np.array([3, 1, 4, 1, 5])
# print(arr + brr)  # 两个数组运算,对应的成员进行运算 两个数组shape相同时候才可以运算

# 4. Numpy统计方法(轴方向axis : 0,1)（sum,max,min,argmax,mean,cumsum,cumprod）
#  针对二维数组
arr = np.arange(1, 10).reshape(3, 3)
print(arr)
# print(arr.sum())  # 没有设置轴方向,把所有的成员进行累加
print(arr.sum(axis=0))  # 求每一列的总和   [12 15 18]
print(arr.sum(axis=1))  # 求每一行总和   [6 15 24]
print(arr.max(axis=0))
print(arr.max(axis=1))
print(arr.mean(axis=0))
print(arr.argmax())  # 找出最大值所在位置
print(arr.cumsum())  # 累加求和
print(np.cumprod(arr, axis=1))
print("*"*100)

# arr = np.arange(1, 13).reshape(2,2, 3)
# print(arr)
# print(arr.sum(axis=2))


# 5.Numpy去重 np.unique(迭代对象)
# arr = np.array([3, 1, 4, 1, 5, 3, 5])
arr = np.array([[1, 2, 3, 4], [5, 5, 6, 6]])
print(arr)
print(np.unique(arr))  # [1 2 3 4 5 6]
arr = np.array([1.8, 2.6, 3.1])
print(np.ceil(arr))  # 向上取整
print(np.floor(arr))  # 向下取整
# 四舍五入?

# 练习1: 产生1-9随机数，要求3*3二维数组
arr = np.random.randint(1, 10, size=(3, 3))
# print(arr)
# 练习2: 统计每列中大于6的元素个数
# print(arr>6)
# brr = arr > 6

# 练习3：统计一行中三个值都是大于3的行的个数
print("*"*100)
print(arr)
# crr = arr>3
# drr = crr.sum(axis=1)
# err = drr == 3
# print(err.sum())
#  all表示所有必须都满足时候才返回True
brr = arr > 3
print(brr)
print(brr.all(axis=1))
print(brr.all(axis=1).sum())

# Build a 3x3 array from the numbers 1-9 in random order, each appearing exactly once
arr = np.arange(1, 10)
# Shuffle in place with NumPy's own shuffle; random.shuffle from the standard
# library is written for Python sequences, not ndarrays.
np.random.shuffle(arr)
print(arr)
print(arr.reshape(3, 3))

其他使用

import numpy as np
# 1.Numpy 普通切片  和序列一致
arr = np.arange(1, 10)
print(arr)  # [1 2 3 4 5 6 7 8 9]
print(arr[2:4])  # [3 4]
print(arr[-1])  # 9
print(arr[::2])  # [1 3 5 7 9]  从前到后, 步长为2

brr = arr  # same object as arr
print(id(brr))  # e.g. 31289872
brr = arr[:]  # full slice: a new array object that is a view sharing arr's data (not a copy)
print(id(brr))  # e.g. 31290112   a different object id

a = np.arange(1, 10).reshape(3, 3)
print(a)
# [[1 2 3]
#  [4 5 6]
#  [7 8 9]]
print(a[2])
# [7 8 9]
print(a[:2])
# [[1 2 3]
#  [4 5 6]]
print(a[2, 1])   # 二维数组就是就坐标   坐标确定   在行位0轴的位置在2  1轴的位置在1
# 8
print(a[:2, :1]) # 两个维度上切片
# [[1]
#  [4]]
print(a[1, :2]) # 一个维度索引,一个维度切片
# [4 5]
print(a[:, :1])
# [[1]
#  [4]
#  [7]]
a[:2, :1] = 0  # 切片赋值

# .2.	Numpy 花式切片 (范围,  & |  !，布尔索引)
print("*" * 100)
# (1)布尔索引   arr>6 生成一个新的数组,数组中的每个成员和表达式运算的结果
arr = np.arange(3, 10)
print(arr)  # [3 4 5 6 7 8 9]
print(arr > 6)  # [False False False False  True  True  True]
print(arr[arr > 6])  # [7 8 9]   为True成员保留,为False成员删除
print(arr[arr != 3])  # [4 5 6 7 8 9]  等于3的数字被删除

# 切片 大于4  小于 8         & |  !
# print(arr[arr>4 and arr<8])   报错
# print(arr[arr>4 & arr<8])  # 没括号，
print(arr[(arr > 4) & (arr < 8)])  # [5 6 7]

# 3 Numpy向量运算（）
print("3****")
arr = np.array([1, 2, 3, 4, 5])
brr = np.array([1, 2, 3, 4, 5])
crr = np.array([1, 2, 3, 4, 5, 6])

print(arr + 2)  # [3 4 5 6 7]
print(arr + brr)  # [ 2  4  6  8 10]
# print(arr+crr)  # 元素个数相同

# 4  	Numpy统计方法(轴方向axis : 0,1)
# （sum,max,min,argmax,mean,cumsum,cumprod）
print("4*****")
arr = np.arange(1, 10).reshape(3, 3)
print(arr)
# [[1 2 3]
#  [4 5 6]
#  [7 8 9]]
print(arr.sum())  # 45
print(arr.sum(axis=0))
# [12 15 18]
print(arr.sum(axis=1))
# [ 6 15 24]

# NOTE(review): these calls use `a` (the 3x3 array whose first two rows had column 0 set to 0 by the slice assignment), not `arr` — confirm this is intended
print(a.max(axis=0))  # [7 8 9]
print(a.min())  # 0
print(a.mean(axis=1))  # [1.66666667 3.66666667 8.        ]
print(a.std())  # 3.1661792997277796
print(a.var())  # 10.024691358024691
print(a.cumsum())  # [ 0  2  5  5 10 16 23 31 40]
print(a.cumsum(axis=1))  # [[ 0  2  5] [ 0  5 11] [ 7 15 24]]  running total along each row
print(a.cumprod())  # [0 0 0 0 0 0 0 0 0]
# 找出最大的最在位置
print(arr.max())  # 最大值 9
print(arr.argmax())  # 8    下标从0开始

# 三维数组
print('***三维数组***')
arr = np.arange(1, 13).reshape(2, 2, 3)
print(arr)
# [[[ 1  2  3]
#   [ 4  5  6]]
#
#  [[ 7  8  9]
#   [10 11 12]]]

print(arr.sum(axis=0))
# [[ 8 10 12]
#  [14 16 18]]
print(arr.sum(axis=1))
# [[ 5  7  9]
#  [17 19 21]]
print(arr.sum(axis=2))
# [[ 6 15]
#  [24 33]]

# 5.Numpy去重 np.unique(迭代对象)
arr = np.array([1, 2, 3, 2, 3, 4, 5, 6, 4])
brr = np.array([[1, 2, 3, 4], [1, 2, 2, 3]])
print(arr)  # [1 2 3 2 3 4 5 6 4]
print(np.unique(arr))  # [1 2 3 4 5 6]

print(brr)
print(np.unique(brr))

# 浮点类型的操作
arr = np.array([1.2, 2.6, 3.1])
print(np.ceil(arr))  # [2. 3. 4.]
print(np.floor(arr))  # [1. 2. 3.]
print(np.round(arr))  # [1. 3. 3.]

# 练习1: 产生1-9随机数，要求3*3二维数组
arr = np.random.randint(1, 10, size=(3, 3))
print(arr)
# [[4 2 2]
#  [8 3 8]
#  [6 5 7]]
# 练习2: 统计每列中大于6的元素个数
brr=arr>6
print(brr)
# [[False False False]
#  [ True False  True]
#  [False False  True]
print(brr.sum(axis=0)) # [1 0 2]

# 练习3：统计一行中三个值都是大于3的行的个数
# 练习3 方法1
arr = np.random.randint(1, 10, size=(3, 3))
print(arr)
crr=arr>3
print(crr)
# [[ True False  True]
#  [ True  True False]
#  [ True  True  True]]
drr=crr.sum(axis=1)
print(drr) # [2 2 3]
print(drr==3)  # [False False  True]
print(drr[drr==3].sum()/3)

# 练习3 方法2
print('练习3********')
crr=arr>3
drr=crr.sum(axis=1)
err = drr==3
print(crr)
# [[ True  True False]
#  [ True  True  True]
#  [ True  True  True]]
print(drr) # False 为0 True为1 求和
# [2 2 3]
print(err)
# [False  True  True]
print(err.sum()) # False 为0 True为1 求和
# 2

# 练习3 简单方法  调用all 表示所有必需满足 才返回True
arr = np.random.randint(1, 10, size=(3, 3))
print(arr)
# [[9 8 7]
#  [2 2 2]
#  [5 5 2]]
brr=arr>3
print(brr)
# [[ True  True  True]
#  [False False False]
#  [ True  True False]]
print(brr.all(axis=1))   # 每一行为true 就为true    这里的axis=1
# [ True False False]
print(brr.all(axis=1).sum()) # 1


# 练习4 :创建一个由1-9随机生成 3*3 二维数组,每个数字只出现一次
print('练习4***********')
arr=np.arange(1,10)
print(arr) # [1 2 3 4 5 6 7 8 9]
# 把arr数组随机排列
np.random.shuffle(arr)
print(arr) # [7 4 9 5 3 2 8 6 1]
print(arr.reshape(3,3))
# [[7 4 9]
#  [5 3 2]
#  [8 6 1]]

3.Pandas

3.1 Pandas的使用Series序列

# 序列
import pandas as pd
import numpy as np

# 1.创建序列
lst = [1, 2, 3, 4, 5]
tup = (4, 5, 6)
dic = {'name':'candle', 'age':18}  # 字典的键就是序列行索引
arr = np.array(lst)
pd.Series(arr)
# 2. Custom index
se = pd.Series(range(1, 5), index=list('abcd'))  # the index length must equal the number of values
# Duplicate labels are accepted at creation time but make label lookups ambiguous
print(se)
se = pd.Series(range(1, 10))
se.index = list("abcdefghi")
print(se)
# print(se.values)  # the values of a Series are a 1-D array
# print(list(se.index))  # the index labels

# 4. Access a single value
print(se.iloc[2])  # by position; se[2] on a label index uses a deprecated integer fallback
print(se['c'])  # by label

# 5. Fancy indexing
# Positional slice
print(se.iloc[2:])
# List of labels
lst = ['a', 'e', 'f']
print(se[lst])
# Boolean mask
print(se[se > 5])
print('e' in se.index)  # membership test on the index: label in series.index

# 分类统计
se = pd.Series(['aa', 'bb', 'cc', 'aa', 'aa', 'cc'])
print(se.value_counts())  # 默认降序
print(se.value_counts(ascending=True))

# Detect missing values
se = pd.Series(['1', '2', '3', '4', '5'], index=list('ABCDE'))
# Assign by position through .iloc: plain se[1] on a label index relies on the
# deprecated integer fallback of [].
se.iloc[1] = None
se.iloc[3] = np.nan
print(se)

print(se.dropna())  # dropna() removes the missing entries (None / NaN)





# lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
# print(pd.DataFrame(lst))
#     id age name
# 1
# 2
# 3

2_1 pandas的使用_Series(序列)

import pandas as pd
import numpy as np
# Python最强大数据分析工具，包含了两种重要的数据类型：Series, DataFrame
# 1. Series：序列 由索引列+值      (类似字典)
# (1) 序列的定义 s = pd.Series(data, index)
# 	data为列表，元组或字典（key是行索引）   numpy中只能用列表和元组创建

lst=[1,2,3,4,5]
print(pd.Series(lst))

# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# dtype: int64

tup=(4,5,6)
print(pd.Series(tup))
# 0    4
# 1    5
# 2    6
# dtype: int64

# 序列的键就是行的索引
dic={"lihua":18,"zm":10}
print(pd.Series(dic))
# lihua    18
# zm       10
# dtype: int64


arr=np.array([1,2,3,4,5])
print(pd.Series(arr))
# 0    1
# 1    2
# 2    3
# 3    4
# 4    5
# dtype: int32


lst2=[[1,2,3],[4,5,6],[7,8,9]] # 类似表格
print(pd.DataFrame(lst2))
#    0  1  2
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9


# (2)默认索引从0开始，可以自定义索引
# (2)_1
print(type(list('abcd'))) # <class 'list'>
se=pd.Series(range(1,5),index=list('abcd'))
print(se)
# a    1
# b    2
# c    3
# d    4
# dtype: int64

#a.自定义索引必须与数字个数一致
# se=pd.Series(range(1,5),index=list('abcde'))
#     .format(val=len(data), ind=len(index)))
# ValueError: Length of passed values is 4, index implies 5

#b.自定义索引相同,创建的时候没有问题,取值的时候有问题
se=pd.Series(range(1,5),index=list('aade'))
print(se)
# a    1
# a    2
# d    3
# e    4
# dtype: int64

# (2)_2 Assign a custom index after creation
se = pd.Series(range(1, 5))
print(se)
# 0    1
# 1    2
# 2    3
# 3    4

# list('abcd') calls the list constructor on a string -> ['a', 'b', 'c', 'd']
se.index = list('abcd')
print(se)
# a    1
# b    2
# c    3
# d    4

# Index labels of the Series
print(se.index)
# Index(['a', 'b', 'c', 'd'], dtype='object')
# Values of the Series: a 1-D array
print(se.values)
# [1 2 3 4]


# 2. Read values through the index (positional or custom label)
# (1) By position

# a    1
# b    2
# c    3
# d    4
print(se.iloc[2])  # 3  (se[2] on a label index uses a deprecated integer fallback)

print(se.iloc[2:])
# c    3
# d    4
print(se['c'])  # 3

# 3 分别查看索引index和值values
# list('abcd') 这个是list的声明   dic()  tuple()
se.index=list('abcd')
print(se)
# a    1
# b    2
# c    3
# d    4

#获取序列的索引
print(se.index)
# Index(['a', 'b', 'c', 'd'], dtype='object')
# 获取序列中的值  序列中的值实际上就是一维数组
print(se.values)
# [1 2 3 4]

#4. Fancy indexing:
# (1) series[index], where index is a slice or a list (selects several entries at once)
se=pd.Series(range(1,5),index=list('abcd'))
print(se)
# a    1
# b    2
# c    3
# d    4
print(se[2:])
# c    3
# d    4

# (2) Indexing with a list of labels
lst=['a','b']
print(se[lst])
# a    1
# b    2

print(se[['a','c']])
# a    1
# c    3

# (3) Boolean indexing: the comparison yields a mask, the mask selects the rows
print(se>2)
# a    False
# b    False
# c     True
# d     True
print(se[se>2])
# c    3
# d    4

# (4) Series.unique() returns the distinct values only (labels play no part).
# size=4 draws one random integer per label. Without it randint returns a
# single scalar that pandas repeats for every label, so unique() could never
# show more than one value.
se2 = pd.Series(np.random.randint(1, 10, size=4), index=list('abcd'))
print(se2)
print(se2.unique())

# (5) Test whether a label exists: idx in series.index
print('e' in se.index) # False


# 5. Counting how often each value occurs
#    (author's aside: at real big-data scale one node cannot hold or process the data, so distribution is the key)
# s.value_counts() tallies the distinct values, most frequent first by default
# s.value_counts(ascending=True) lists the least frequent first
se=pd.Series(['aa','bb','cc','aa','bb','dd','aa'])
print(se.value_counts()) # descending by default
# aa    3
# bb    2
# dd    1
# cc    1
print(se.value_counts(ascending=True))
# cc    1
# dd    1
# bb    2
# aa    3

# 6. Detecting and removing missing values (None / np.nan)
# s.isna() and s.isnull() both return a boolean mask that is True where a value
# is missing. Neither of them removes anything: s.dropna() is the call that
# returns a copy without the missing entries.

se = pd.Series(['1', '2', '3', '4', '5'], index=list('ABCDE'))
# The targets are addressed by position, so use .iloc. A bare integer key on a
# label-indexed Series depends on a deprecated positional fallback.
se.iloc[1] = None
se.iloc[3] = np.nan
print(type(None))  # <class 'NoneType'>
print(type(np.nan))  # <class 'float'>

print(se)
# A       1
# B    None
# C       3
# D     NaN
# E       5
print(se.isna())
# A    False
# B     True
# C    False
# D     True
# E    False
print(se.isnull())
# A    False
# B     True
# C    False
# D     True
# E    False

# dropna() discards both the None and the np.nan entries
print(se.dropna())
# A    1
# C    3
# E    5

3.2 Pandas的使用DataFrame

# 3.1 DataFram基础知识
import pandas as pd
import numpy as np

# 1. A DataFrame is a table-like structure (comparable to a database table)
#    with a row index and column labels.
lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
# dic = {"name":"candle", "age":18, "addr":"上海"}  # dict keys become the column labels
arr = np.arange(1, 21).reshape(5, 4)
df = pd.DataFrame(arr, index=list('abcde'), columns=list('ABCD'))
print(df)

# 2. Select one column
# print(df.A)
# print(df['A'])

# 3. Select several columns
# print(df[['A', 'B', 'D']])

# 4. Add a column
df['E'] = [5, 6, 7, 8, 9]
# print(df)
# print(df['A'])
# print(df[:2])

# 5. Select rows. DataFrame.ix no longer exists in pandas; .loc selects by
#    label and .iloc by position.
# print(df.loc[['a', 'd']])  # rows a and d
# print(df.loc['c':'e'])  # label slice, both ends inclusive: rows c, d, e
# print(df[1:4])  # integer slice: start inclusive, end exclusive

# 6. Append a row: assigning to a label that does not exist yet creates it
df.loc['f'] = [10, 20, 30, 40, 50]
# print(df)

# 7. Look up individual cells with df.loc[row_label, column_label]
# print(df['c', 'A'])  # not a cell lookup: df[...] looks for a column with that key
# print(df.loc['c', 'A'])
# print(df.loc[['c', 'f'], ['B', 'C']])

# 8. Drop rows or columns; drop() returns a new DataFrame and leaves df untouched
# print(df.drop('a', axis=0))  # drop a row
# print(df.drop('B', axis=1))  # drop a column
# Drop several rows or several columns
# print(df.drop(['b', 'c', 'e'], axis=0))
# print(df.drop(['A', 'D'], axis=1))

# 9. Inspect the data
# print(df.iloc[:3])  # first three rows
# print(df.head(3))
# print(df.tail(3))
# print(list(df.index))  # row labels
# print(list(df.columns)) # column labels
# print(df.values)  # the cell values as a 2-D array
# print(df.shape)  # (rows, columns)
# print(df.describe())  # summary statistics per column

# 10. Sorting and transposition
arr = np.arange(1, 21).reshape(5, 4)
df = pd.DataFrame(arr, index=list('cadbf'), columns=list('DBCA'))
# Frame with deliberately unordered row and column labels, used by the sorting examples
print(df)

# Sort by row/column labels (axis picks rows or columns, ascending picks the direction)
# print(df.sort_index(axis=0, ascending=False))  # axis=0 (rows) is the default
# print(df.sort_index(axis=1))
#
# print(df.sort_values(by= 'A', ascending=False))  # sort the rows by the values in column A
#
# print(df.T)  # swap rows and columns

# 11. Load a DataFrame from a CSV file
df = pd.read_csv('users.csv')
print(df)

2_2 pandas的使用_DataFrame

import pandas as pd
import numpy as np

# 2. A DataFrame is a table-like structure (comparable to a database table) with a row index and column labels

# 1> Creation from a nested list, an array, or a dict (e.g. name, age, gender, tel)
#    NB: dict keys become the column labels; each value supplies that column's rows
lst = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
print(pd.DataFrame(lst))
#    0  1  2
# 0  1  2  3
# 1  4  5  6
# 2  7  8  9

print(pd.DataFrame(lst, index=list('abc'), columns=list('123')))
#    1  2  3
# a  1  2  3
# b  4  5  6
# c  7  8  9

# Dict keys become the column labels; the scalar values are repeated for every row label given in index
dic = {"name": "lihua", "age": 18, "addr": "上海"}
print(pd.DataFrame(dic, index=list("abc")))
#     name  age   addr
# a  lihua   18   上海
# b  lihua   18   上海
# c  lihua   18   上海

# 2> Select a single column
#    df.column_name     df['column_name']
arr = np.arange(1, 21).reshape(5, 4)
df = pd.DataFrame(arr, index=list('abcde'), columns=list("ABCD"))
print(df)
#     A   B   C   D
# a   1   2   3   4
# b   5   6   7   8
# c   9  10  11  12
# d  13  14  15  16
# e  17  18  19  20
print(df.A)
# a     1
# b     5
# c     9
# d    13
# e    17
print(df['A'])
# a     1
# b     5
# c     9
# d    13
# e    17

# 3> Select several columns
#    df[[list of column names]]
print(df[['A', 'B', 'D']])
#     A   B   D
# a   1   2   4
# b   5   6   8
# c   9  10  12
# d  13  14  16
# e  17  18  20

# 4> Add a column
#    df[new_name] = 3          (every row gets the value 3)
#    df[new_name] = [1, 2, 3]  (one value per row, in order)

df['E'] = [5, 6, 7, 8, 9]
print(df)

#     A   B   C   D  E
# a   1   2   3   4  5
# b   5   6   7   8  6
# c   9  10  11  12  7
# d  13  14  15  16  8
# e  17  18  19  20  9
print(df['A'])  # a string key selects a column
# a     1
# b     5
# c     9
# d    13
# e    17
print(df[:2])  # a slice selects rows: positions 0 and 1
#    A  B  C  D  E
# a  1  2  3  4  5
# b  5  6  7  8  6

# 5> Select a single row
#    df.loc[row_label] selects by label. The original text used DataFrame.ix,
#    which has been removed from pandas.
# Row b:
print(df.loc['b'])
# A    5
# B    6
# C    7
# D    8
# E    6

# 6> Select several rows, contiguous or not
#    df.loc[[list of row labels]]
#    df[0:3] is a positional slice (end exclusive);
#    df['a':'c'] is a label slice (both ends inclusive)
# Rows a and d:
print(df.loc[['a', 'd']])
#     A   B   C   D  E
# a   1   2   3   4  5
# d  13  14  15  16  8

# Label slice, both ends inclusive: rows c, d and e
print(df.loc['c':'e'])
#     A   B   C   D  E
# c   9  10  11  12  7
# d  13  14  15  16  8
# e  17  18  19  20  9
# Integer slice: start inclusive, end exclusive
print(df[1:4])
#     A   B   C   D  E
# b   5   6   7   8  6
# c   9  10  11  12  7
# d  13  14  15  16  8

# 7> Append a single row
#    df.loc[new_label] = ['google', 20, 'M', '1686666']
df.loc['f'] = [10, 20, 30, 40, 50]
print(df)
#     A   B   C   D   E
# a   1   2   3   4   5
# b   5   6   7   8   6
# c   9  10  11  12   7
# d  13  14  15  16   8
# e  17  18  19  20   9
# f  10  20  30  40  50


# 8> Drop columns or rows (drop() returns a new DataFrame without them)
#    df.drop(row_label)             rows are the default target
#    df.drop(column_label, axis=1)  axis=1 targets columns
# Drop a row
print(df.drop('a', axis=0))
#     A   B   C   D   E
# b   5   6   7   8   6
# c   9  10  11  12   7
# d  13  14  15  16   8
# e  17  18  19  20   9
# f  10  20  30  40  50
# Drop a column
print(df.drop('B', axis=1))
#     A   C   D   E
# a   1   3   4   5
# b   5   7   8   6
# c   9  11  12   7
# d  13  15  16   8
# e  17  19  20   9
# f  10  30  40  50

# Drop several rows
print(df.drop(['a', 'b', 'd']))
#     A   B   C   D   E
# c   9  10  11  12   7
# e  17  18  19  20   9
# f  10  20  30  40  50
# Drop several columns
print(df.drop(['A', 'E'], axis=1))
#     B   C   D
# a   2   3   4
# b   6   7   8
# c  10  11  12
# d  14  15  16
# e  18  19  20
# f  20  30  40

# 9> Address individual cells
#    df.loc[row_label, column_label]; lists and slices of labels work too.
#    (.loc replaces DataFrame.ix, which has been removed from pandas.)
print(df.loc['c', 'A'])  # 9
print(df.loc[['c', 'd'], ['A', 'B']])
#     A   B
# c   9  10
# d  13  14


# 3. Common ways to inspect a DataFrame
# 1> First rows: df.head(num)
#    df.iloc[:3] is the equivalent positional slice. DataFrame.ix, used in the
#    original text, has been removed from pandas.
print(df.iloc[:3])
print(df.head(3))
#    A   B   C   D  E
# a  1   2   3   4  5
# b  5   6   7   8  6
# c  9  10  11  12  7


# 2> Last rows: df.tail(num)
print(df.tail(3))
#     A   B   C   D   E
# d  13  14  15  16   8
# e  17  18  19  20   9
# f  10  20  30  40  50

# 3> Row labels via df.index, column labels via df.columns
print(list(df.index))  # ['a', 'b', 'c', 'd', 'e', 'f']
print(list(df.columns))  # ['A', 'B', 'C', 'D', 'E']

# 4> df.values holds the cell values as a two-dimensional array
print(df.values)
# [[ 1  2  3  4  5]
#  [ 5  6  7  8  6]
#  [ 9 10 11 12  7]
#  [13 14 15 16  8]
#  [17 18 19 20  9]
#  [10 20 30 40 50]]
# Shape of the DataFrame
print(df.shape)  # (6, 5): 6 rows, 5 columns

# 5> df.describe() summarises the distribution of the data, one set of statistics per column
print(df.describe())
#                A          B          C          D          E
# count   6.000000   6.000000   6.000000   6.000000   6.000000
# mean    9.166667  11.666667  14.166667  16.666667  14.166667
# std     5.671567   6.976150   9.600347  12.754084  17.611549
# min     1.000000   2.000000   3.000000   4.000000   5.000000
# 25%     6.000000   7.000000   8.000000   9.000000   6.250000
# 50%     9.500000  12.000000  13.000000  14.000000   7.500000
# 75%    12.250000  17.000000  18.000000  19.000000   8.750000
# max    17.000000  20.000000  30.000000  40.000000  50.000000

# 6> Sorting by row/column labels (axis picks rows or columns, ascending picks the direction)
#    df = df.sort_index()         rows, ascending label order (the default)
#    df = df.sort_index(axis=1)   columns, ascending label order
arr = np.arange(1, 21).reshape(5, 4)
df = pd.DataFrame(arr, index=list("cbaed"), columns=list("DBAC"))
print(df)
#    D   B   A   C
# c   1   2   3   4
# b   5   6   7   8
# a   9  10  11  12
# e  13  14  15  16
# d  17  18  19  20
print(df.sort_index())
#     D   B   A   C
# a   9  10  11  12
# b   5   6   7   8
# c   1   2   3   4
# d  17  18  19  20
# e  13  14  15  16
print(df.sort_index(axis=0,ascending=False))
#     D   B   A   C
# e  13  14  15  16
# d  17  18  19  20
# c   1   2   3   4
# b   5   6   7   8
# a   9  10  11  12

print(df.sort_index(axis=1))
#     A   B   C   D
# c   3   2   4   1
# b   7   6   8   5
# a  11  10  12   9
# e  15  14  16  13
# d  19  18  20  17

# 7> Sorting by cell values (ascending picks the direction)
#    df.sort_values(by="A")                   one column, ascending
#    df.sort_values(by="A",ascending=False)   one column, descending
#    df.sort_values(by=['A', 'B'])            several columns
# Sort the rows by the values in column A
print(df.sort_values(by="A"))
#     D   B   A   C
# c   1   2   3   4
# b   5   6   7   8
# a   9  10  11  12
# e  13  14  15  16
# d  17  18  19  20
print(df.sort_values(by="A",ascending=False))
#     D   B   A   C
# d  17  18  19  20
# e  13  14  15  16
# a   9  10  11  12
# b   5   6   7   8
# c   1   2   3   4

# 8> Transposition: df.T swaps rows and columns
print(df.T)
#    c  b   a   e   d
# D  1  5   9  13  17
# B  2  6  10  14  18
# A  3  7  11  15  19
# C  4  8  12  16  20

#9> Load a CSV file into a DataFrame
print(pd.read_csv('users.csv'))

3.2.1 DataFrame处理缺失数据

import numpy as np
import pandas as pd



# 5. Handling missing data in a DataFrame
#
# Build a 5x4 frame, then blank out three cells:
#   rows 0 and 1 of column D, and row 3 of column C.

arr = np.arange(1, 21).reshape(5, 4)
df = pd.DataFrame(arr, columns=list("ABCD"))
print(df)
#     A   B   C   D
# 0   1   2   3   4
# 1   5   6   7   8
# 2   9  10  11  12
# 3  13  14  15  16
# 4  17  18  19  20

# NaN is a float, so convert the two target columns up front instead of
# relying on pandas to upcast an integer column during assignment.
df = df.astype({'C': float, 'D': float})
# .loc addresses cells by row/column label (DataFrame.ix has been removed).
df.loc[3, 'C'] = np.nan
df.loc[[0, 1], 'D'] = np.nan
print(df)
#     A   B     C     D
# 0   1   2   3.0   NaN
# 1   5   6   7.0   NaN
# 2   9  10  11.0  12.0
# 3  13  14   NaN  16.0
# 4  17  18  19.0  20.0

# (1) Drop every row that contains a NaN (axis=0 is the default)
print(df.dropna())
print(df.dropna(axis=0))
#     A   B     C     D
# 2   9  10  11.0  12.0
# 4  17  18  19.0  20.0
# (2) Drop every column that contains a NaN
print(df.dropna(axis=1))
#     A   B
# 0   1   2
# 1   5   6
# 2   9  10
# 3  13  14
# 4  17  18
# (3) Replace every NaN with one value
print(df.fillna(0.0))
#     A   B     C     D
# 0   1   2   3.0   0.0
# 1   5   6   7.0   0.0
# 2   9  10  11.0  12.0
# 3  13  14   0.0  16.0
# 4  17  18  19.0  20.0


# (4) Replace NaN per column by passing a {column: fill_value} mapping
print(df.fillna({'C': 0.0, 'D': -1}))
#     A   B     C     D
# 0   1   2   3.0  -1.0
# 1   5   6   7.0  -1.0
# 2   9  10  11.0  12.0
# 3  13  14   0.0  16.0
# 4  17  18  19.0  20.0

3.2.2 DataFrame处理分组统计数据

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
                   'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
                   'C': np.random.randint(8, size=8),
                   'D': np.random.randint(8, size=8)})


print(df)
# Sample output. C and D are random, so the numbers differ on every run:
#      A      B  C  D
# 0  foo    one  2  4
# 1  bar    one  3  1
# 2  foo    two  5  0
# 3  bar  three  4  4
# 4  foo    two  0  1
# 5  bar    two  7  6
# 6  foo    one  6  0
# 7  foo  three  0  3

# Syntax: df.groupby(key_column)[value_columns].sum()
# Group by column B and total C and D within each group. Selecting
# [['C', 'D']] keeps the text column A out of the aggregation.
sum_by_b = df.groupby(by='B')[['C', 'D']].sum()
print(sum_by_b)
# For the sample frame above:
#         C  D
# B
# one    11  5
# three   4  7
# two    12  7

# Group by A first and by B within each A group, again totalling C and D
sum_by_ab = df.groupby(by=['A', 'B'])[['C', 'D']].sum()
print(sum_by_ab)
# For the sample frame above:
#            C  D
# A   B
# bar one    3  1
#     three  4  4
#     two    7  6
# foo one    8  4
#     three  0  3
#     two    5  1

3.2.3 DataFrame练习

import numpy as np
import pandas as pd
# 1. Load users.csv into a DataFrame (read_csv splits fields on commas by default)
df = pd.read_csv("users.csv", encoding="utf-8")
print(df)

# 2. First 20 records
print(df.head(20))
# 3. How many records does the data set contain?
print(df.shape[0])  # 943
print(df.index.size)  # 943
# 4. How many columns are there?
print(df.shape[1])  # 5
print(df.columns.size)  # 5
# 5. All row labels and column names
#    (the recorded row-label output is abbreviated here; the full list runs from 0 to 942)
print(list(df.index))  # [0, 1, 2, ..., 941, 942]
print(list(df.columns))  # ['user_id', 'age', 'gender', 'occupation', 'zip_code']

# 6. Print the occupation column
print(df['occupation'])
print(df.occupation)
# 7. How many distinct occupations are there?
print(df['occupation'].unique())
print(df['occupation'].unique().size)  # 21

# 8. Which occupations occur most often?
print(df['occupation'].value_counts().head(10))
# student          196
# other            105
# educator          95
# administrator     79
# engineer          67
# programmer        66
# librarian         51
# writer            45
# executive         32
# scientist         31

# 9. Mean age of all users
print(df['age'].mean())

# 10. Mean age of the male users (df is narrowed to the male rows from here on)
df = df[df['gender'] == 'M']
print(df)
print(df['age'].mean())  # 34.149253731343286

6.Python数据可视化

6.1 绘图基础知识1

import numpy as np
import pandas as pd

# NOTE: the bare expressions below only display a result in an interactive
# session (notebook / REPL); wrap them in print() when running as a script.

# 1. Load Pokemon.csv
df = pd.read_csv('Pokemon.csv', encoding='utf-8')

# 2. Lower-case every column name
df_index = df.columns.str.lower()  # lower-cased copy of the column labels
df.columns = df_index
# print(df)

# 3. Drop the '#' column
df = df.drop('#', axis=1)

# 4. Use the name column as the row index
# df.index = df['name'].values
# df = df.drop('name', axis=1)
df = df.set_index('name')

# 5. Select the legendary pokemon
df[df['legendary'] == True]

# 6. Show every attribute of Pikachu
#    (.loc selects by row label; DataFrame.ix has been removed from pandas)
df.loc['Pikachu']

# 7. List the distinct primary types
df['type 1'].unique()

# 8. Select the pokemon that are both Fire and Dragon, in either type slot
df[((df['type 1']=='Fire') & (df['type 2']=='Dragon')) |
  ((df['type 1']=='Dragon') & (df['type 2']=='Fire'))]

# 9. The three pokemon with the highest total stats
df.sort_values(by='total', ascending=False).head(3)

# 10. The three Fire pokemon with the highest attack
data = df[(df['type 1']=='Fire') | (df['type 2']=='Fire')]
data.sort_values(by='attack', ascending=False).head(3)

# 11. Number of pokemon per type (frequency count)
df['type 1'].value_counts()
df['type 2'].value_counts()

# 12. Number of Water pokemon (either type slot)
water = df[(df['type 1']=='Water') | (df['type 2']=='Water')]
print(water.shape[0])

6.2 绘图基础知识2

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Use a font that contains CJK glyphs so the Chinese titles and labels render
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

# 1. Load the CSV file and prepare the data
df = pd.read_csv(r'Pokemon.csv', encoding='utf-8')
df = df.drop("#", axis=1)
df = df.set_index('Name')  # use Name as the row index

# 2. Histogram of the attack stat
# print(df['Attack'])
def draw_hist():
    """Plot a histogram of df['Attack'] with a dashed line at the mean.

    Reads the module-level ``df`` and opens a matplotlib window.
    """
    attack = df['Attack']
    bin_edges = range(0, 200, 25)
    # Count how many attack values fall into each bin
    plt.hist(attack, bins=bin_edges, width=18, color="b", edgecolor='y')
    plt.title('宠物攻击力分布直方图')
    plt.xlabel('攻击力')
    plt.ylabel('数量')
    # axvline draws a vertical line, here at the mean attack value
    mean_attack = attack.mean()
    plt.axvline(mean_attack, linestyle="dashed", color="r")
    plt.show()

# draw_hist()


def draw_scatter():
    """Scatter attack against defense for Water and Fire pokemon.

    A pokemon belongs to a type when either 'Type 1' or 'Type 2' matches.
    Reads the module-level ``df`` and opens a matplotlib window.
    """
    series_specs = (
        ('Water', {'color': 'blue'}),
        ('Fire', {'color': 'red', 'marker': '*'}),
    )
    for type_name, style in series_specs:
        members = df[(df['Type 1'] == type_name) | (df['Type 2'] == type_name)]
        plt.scatter(members['Attack'], members['Defense'], label=type_name, **style)
    plt.legend()  # build the legend from the labels above
    plt.title("水系火系宠物攻击力/防御力分布散点图", color='red', fontsize=20)
    plt.xlabel("攻击力")
    plt.ylabel("防御力")
    plt.show()

# draw_scatter()


# Scatter plot with marginal histograms
def draw_jointplot():
    """Draw a seaborn joint plot of attack vs. defense for Water pokemon.

    Reads the module-level ``df`` and opens a matplotlib window.
    """
    water = df[(df['Type 1'] == 'Water') | (df['Type 2'] == 'Water')]
    # Pass the two series by keyword: current seaborn releases no longer
    # accept x and y as positional arguments.
    sns.jointplot(x=water.Attack, y=water.Defense)
    plt.show()


def draw_countplot():
    """Draw a count plot of the ten most common 'Type 1' values.

    Reads the module-level ``df`` without modifying it: the filtered rows are
    kept in a local frame, so plotting no longer narrows ``df`` for whatever
    code runs afterwards.
    """
    top_types = df['Type 1'].value_counts().head(10).index
    top_df = df[df['Type 1'].isin(top_types)]
    sns.set(style="darkgrid")
    sns.countplot(x='Type 1', data=top_df)  # hue='legendary'
    plt.show()


#饼状图
# def draw_pie():
#     x = df['Type 1'].values
#     labels = list(df['Type 1'].index) # 标签
#
#     plt.pie(x,labels=labels,)

6.3 绘图基础知识3

import numpy as np
import pandas as pd
import  matplotlib as mb
import  seaborn as sb


# pandas practice: data analysis
# 1. Load Pokemon.csv
df =pd.read_csv('Pokemon.csv',encoding='utf-8')
print(df)
# 2. Lower-case every column name
print(df.columns)
# Index(['#', 'Name', 'Type 1', 'Type 2', 'Total', 'HP', 'Attack', 'Defense',
#        'Sp. Atk', 'Sp. Def', 'Speed', 'Generation', 'Legendary'],
#       dtype='object')
print(df.columns.str) # <pandas.core.strings.StringMethods object at 0x0000000001ECF518>

# Lower-cased copy of the column labels
df_index=df.columns.str.lower()
# Assign it back as the new column labels
df.columns=df_index
print(df.columns)
# Index(['#', 'name', 'type 1', 'type 2', 'total', 'hp', 'attack', 'defense',
#        'sp. atk', 'sp. def', 'speed', 'generation', 'legendary'],
#       dtype='object')
# 3. Drop the '#' column
df=df.drop("#",axis=1)
print(df)
# 4. Use the name column as the row index
# df.index=df['name'].values
# print(df)
# df=df.drop('name',axis=1)
# print(df)

# set_index does the two commented-out steps above in one call
df=df.set_index('name')
print(df)


# 5. Select the legendary pokemon
print(df[df['legendary']==True])


# 6. Show every attribute of Pikachu
#    (.loc selects the row by label; DataFrame.ix has been removed from pandas)
print(list(df.loc['Pikachu']))
# ['Electric', nan, 320, 35, 55, 40, 50, 50, 90, 1, False]

# 7. List the distinct primary types. The column names were lower-cased above
#    (lookups are case-sensitive); note the space inside 'type 1'.
print(df['type 1'].unique())
# ['Grass' 'Fire' 'Water' 'Bug' 'Normal' 'Poison' 'Electric' 'Ground'
#  'Fairy' 'Fighting' 'Psychic' 'Rock' 'Ghost' 'Ice' 'Dragon' 'Dark' 'Steel'
#  'Flying']

# 8. Select the pokemon that are both Fire and Dragon, in either type slot
print("8*************")
fire_dradon=df[((df['type 1']=='Fire')&(df['type 2']=='Dragon'))|((df['type 1']=='Dragon')&(df['type 2']=='Fire'))]
print(fire_dradon)
#                            type 1  type 2  total  ...  speed  generation  legendary
# name                                              ...
# CharizardMega Charizard X    Fire  Dragon    634  ...    100           1      False
# Reshiram                   Dragon    Fire    680  ...     90           5       True

# 9. The three pokemon with the highest total stats
# Sort by total in descending order and keep the first three rows
print("9***********")
print(df.sort_values(by='total',ascending=False).head(3))
#                         type 1    type 2  total  ...  speed  generation  legendary
# name                                             ...
# RayquazaMega Rayquaza   Dragon    Flying    780  ...    115           3       True
# MewtwoMega Mewtwo Y    Psychic       NaN    780  ...    140           1       True
# MewtwoMega Mewtwo X    Psychic  Fighting    780  ...    130           1       True

# 10. The three Fire pokemon (either type slot) with the highest attack
print("10***************")
typefire=df[(df['type 1']=='Fire')| (df['type 2']=='Fire')]

print(typefire.sort_values(by='attack',ascending=False).head(3))

#                          type 1    type 2  total  ...  speed  generation  legendary
# name                                              ...
# GroudonPrimal Groudon    Ground      Fire    770  ...     90           3       True
# BlazikenMega Blaziken      Fire  Fighting    630  ...    100           3      False
# DarmanitanStandard Mode    Fire       NaN    480  ...     95           5      False


# 11. Number of pokemon per type (frequency count)
print("11******")
print(df['type 1'].value_counts())
# Water       112
# Normal       98
# Grass        70
# Bug          69
# Psychic      57
# Fire         52
# Rock         44
# Electric     44
# Dragon       32
# Ground       32
# Ghost        32
# Dark         31
# Poison       28
# Steel        27
# Fighting     27
# Ice          24
# Fairy        17
# Flying        4
print(df['type 2'].value_counts())

# 12. Number of Water pokemon (either type slot)
typewater=df[(df['type 1']=='Water')| (df['type 2']=='Water')]
print(typewater.shape[0])  # 126

6.4 基于Matplotlib数据分析制图

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# pandas practice: data analysis with plots
# 1. Load Pokemon.csv
df = pd.read_csv('Pokemon.csv', encoding='utf-8')
# Drop the leading '#' column
df = df.drop("#", axis=1)
# Use Name as the row index
df = df.set_index('Name')
# Column names keep their original capitalisation here, so lookups must match it
# print(df)

# Make Chinese text render correctly in the figures
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # SimHei is an alternative font
plt.rcParams['axes.unicode_minus'] = False


# (1) Histogram (hist):
# attack distribution across all pokemon

def draw_hist():
    """Plot a histogram of df['Attack'] with a dashed line at the mean.

    Reads the module-level ``df`` and opens a matplotlib window.
    """
    attack = df['Attack']
    # Bin edges from 0 up to (not including) 200 in steps of 20
    edges = range(0, 200, 20)
    # width=18 draws each bar slightly narrower than its 20-wide bin;
    # color fills the bars, edgecolor outlines them
    plt.hist(attack, edges, width=18, color="b", edgecolor='y')
    plt.title('宠物攻击力分布直方图')
    plt.xlabel('攻击力')
    plt.ylabel('数量')
    # Dashed red vertical line at the mean attack value
    mean_attack = attack.mean()
    plt.axvline(mean_attack, linestyle="dashed", color="r")
    plt.show()


# draw_hist()


# (2) Scatter plot (scatter):
# attack vs. defense for Water and Fire pokemon
def draw_scatter():
    """Scatter attack against defense for Water and Fire pokemon.

    A pokemon belongs to a type when either 'Type 1' or 'Type 2' matches.
    Reads the module-level ``df`` and opens a matplotlib window.
    """
    series_specs = (
        ('Water', {'color': 'blue'}),
        ('Fire', {'color': 'red', 'marker': '*'}),
    )
    for type_name, style in series_specs:
        members = df[(df['Type 1'] == type_name) | (df['Type 2'] == type_name)]
        # x axis: attack, y axis: defense; label feeds the legend
        plt.scatter(members['Attack'], members['Defense'], label=type_name, **style)
    # Build the legend from the labels above
    plt.legend()
    plt.title("水系火系宠物攻击力/防御力分布散点图", color='red', fontsize=18)
    plt.xlabel("攻击力")
    plt.ylabel("防御力")
    # Display the figure
    plt.show()




# (3) seaborn joint plot (jointplot):
# attack vs. defense for Water pokemon, a scatter plot with marginal histograms
def draw_jointplot():
    """Draw a seaborn joint plot of attack vs. defense for Water pokemon.

    Reads the module-level ``df`` and opens a matplotlib window.
    """
    # Rows whose primary or secondary type is Water
    water = df[(df['Type 1'] == 'Water') | (df['Type 2'] == 'Water')]
    # Attack on the x axis, Defense on the y axis. They are passed by keyword
    # because current seaborn releases no longer accept x and y positionally.
    sns.jointplot(x=water.Attack, y=water.Defense)
    plt.show()



# (4) seaborn count plot (countplot):
# number of pokemon for the ten most common primary types
def draw_countplot(df):
    """Draw a count plot of the ten most frequent 'Type 1' values in ``df``.

    Args:
        df: frame with a 'Type 1' column. It is only read; the filtered rows
            live in a local frame.
    """
    # value_counts() sorts by frequency, so head(10) yields the ten most common types
    top_ten = list(df['Type 1'].value_counts().head(10).index)
    in_top_ten = df['Type 1'].isin(top_ten)
    # Available style presets: white, dark, whitegrid, darkgrid, ticks
    sns.set(style="darkgrid")
    # x names the column to count; data is the frame reduced to the top-ten types
    sns.countplot(x='Type 1', data=df[in_top_ten])  # hue='legendary'
    # Display the figure
    plt.show()

print("基于seaborn分类统计图")
# draw_countplot(df)

# The same filtering steps as in draw_countplot, run at module level for inspection.
# NOTE: from here on the module-level df holds only the ten most common primary types.
data=df['Type 1'].value_counts().head(10)
index=list(data.index)   # ['Water', 'Normal', 'Grass', 'Bug', 'Psychic', 'Fire', 'Rock', 'Electric', 'Ghost', 'Ground']
df = df[df['Type 1'].isin(index)]
# print(data)
# print(index)
# print(df)

# draw_countplot(df)

# (5) Pie chart
def draw_pie():
    """Draw a pie chart of the share of each 'Type 1' value in ``df``.

    Reads the module-level ``df`` and opens a matplotlib window. The third
    wedge is pulled out of the pie for emphasis.
    """
    counts = df['Type 1'].value_counts()
    # Wedge labels are the type names, wedge sizes the number of pokemon per type
    labels = counts.index
    sizes = counts.values
    # explode needs exactly one offset per wedge, so derive it from the data
    # rather than hard-coding 18 entries: df may hold fewer types (it is cut
    # down to ten types earlier in this script).
    explode = [0.1 if position == 2 else 0 for position in range(len(sizes))]
    # autopct="%1.1f %%" prints each share with one decimal place and a percent sign
    plt.pie(sizes, explode=explode, labels=labels, autopct="%1.1f %%")
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.title("不同类型宠物比例")
    plt.show()  # share of each type

# draw_pie()

# The same intermediate values draw_pie computes, kept at module level for inspection
Info = df['Type 1'].value_counts()
labels =df['Type 1'].value_counts().index
size = df['Type 1'].value_counts().values
# print(Info)
# print(labels)
# print(size)


# (6) Box plot
# distribution of every remaining stat across all pokemon
def draw_box():
    """Draw one box per remaining column of ``df``.

    'Generation', 'Total' and 'Legendary' are dropped first. Reads the
    module-level ``df`` and opens a matplotlib window.
    """
    excluded = ['Generation', 'Total', 'Legendary']
    stats = df.drop(excluded, axis=1)
    # whis is the whisker length as a multiple of the box height (1.5 is the
    # default); points beyond the whiskers are drawn as outliers
    sns.boxplot(data=stats, whis=1.5)
    # Limit the y axis to 0..300
    plt.ylim(0, 300)
    plt.show()


#draw_box()
df2=df.drop(['Generation','Total','Legendary'],axis=1)
#print(df2)  # remaining columns: Type 1  Type 2   HP  ...  Sp. Atk  Sp. Def  Speed; Type 1/Type 2 are not numeric, presumably seaborn leaves them out of the box plot (TODO confirm)


# Box plot of one stat, grouped by type
def draw_box2():
    """Show the attack distribution per 'Type 1', as a box plot and then as a violin plot.

    Reads the module-level ``df`` and opens two matplotlib windows in turn.
    """
    plt.title('以Tpye1分类')
    # One box per 'Type 1' value on the x axis, built from the Attack values
    sns.boxplot(x="Type 1", y='Attack', data=df)
    # Limit the y axis to 0..200. The original passed the single float 0.200,
    # which only moved the lower limit to 0.2.
    plt.ylim(0, 200)
    plt.show()
    # A violin plot plays the same role as the box plot but draws the
    # estimated density instead of the quartile box
    sns.violinplot(x="Type 1", y="Attack", data=df)
    # Same y-axis range as the box plot
    plt.ylim(0, 200)
    plt.show()

# draw_box2()

# (7) Violin plot
# compare the total stats of two generations
# Only generations 1 and 2 are kept, so each violin contrasts those two
def draw_violin():
    """Draw split violins of 'Total' per 'Type 1', contrasting generations 1 and 2.

    Only the Fire, Water, Grass and Dragon primary types are plotted. Reads
    the module-level ``df`` and opens a matplotlib window.
    """
    # Rows from generation 1 or 2 ...
    in_generation = df['Generation'].isin([1, 2])
    # ... whose primary type is one of the four types of interest
    in_types = df['Type 1'].isin(['Fire', 'Water', 'Grass', 'Dragon'])
    subset = df[in_generation & in_types]
    # x: primary type, y: total stats, hue: the column that splits each violin
    sns.violinplot(x='Type 1', y='Total', hue='Generation', data=subset, split=True)
    plt.show()



#draw_violin()
# The same two filtering steps as in draw_violin, run at module level for inspection
data=df[df['Generation'].isin([1,2])]
# print(data)
data=data[data['Type 1'].isin(['Fire','Water','Grass','Dragon'])]
# print(data)

#(8) Categorical scatter plot (swarm plot)
def draw_swarm():
    """Draw a swarm plot of HP per 'Type 1' for the ten most common types.

    Points are coloured by 'Legendary' and a dashed red line marks the mean
    HP of the plotted rows. Reads the module-level ``df`` and opens a
    matplotlib window.
    """
    # The ten most frequent primary types
    most_common = df['Type 1'].value_counts().head(10)
    # Keep only the rows belonging to those types
    plotted = df[df['Type 1'].isin(most_common.index)]
    # One point per pokemon: x is the type, y is HP, hue splits by Legendary
    sns.swarmplot(x='Type 1', y='HP', data=plotted, hue='Legendary')
    # Horizontal line at the mean HP
    mean_hp = plotted['HP'].mean()
    plt.axhline(mean_hp, color='red', linestyle='dashed')
    plt.show()


draw_swarm()
# The same intermediate values draw_swarm computes, printed for inspection
top_types=df['Type 1'].value_counts()[:10]
print(top_types)
# Water       112
# Normal       98
# Grass        70
# Bug          69
# Psychic      57
# Fire         52
# Rock         44
# Electric     44
# Ground       32
# Ghost        32
df1 =df[df['Type 1'].isin(top_types.index)]
print(df1)  # rows of the ten most common types only
print("mean**************")
# mean must be called; without the parentheses the bound method object is
# printed instead of the mean HP.
print(df1['HP'].mean())


# (9) Line chart
def draw_group():
    """Plot, per generation, how many pokemon of selected primary types exist.

    Reads the module-level ``df`` and opens a matplotlib window. Types that
    are absent from ``df`` are skipped rather than raising KeyError; this
    matters because the script narrows ``df`` to the ten most common types
    before this function can be called.
    """
    # Rows per (Generation, Type 1) pair; after count() every remaining column
    # holds the group's non-null count, and 'Total' is the one used as the tally
    a = df.groupby(['Generation', 'Type 1']).count().reset_index()
    a = a[['Generation', 'Type 1', 'Total']]
    # Wide format: one row per generation (x axis), one column per type
    # (one line each), cell value is the count (y axis)
    a = a.pivot(index='Generation', columns='Type 1', values='Total')
    # Plot only a hand-picked subset of the types, limited to those present
    wanted = ['Water', 'Fire', 'Grass', 'Dragon', 'Normal', 'Rock', 'Electric']
    available = [type_name for type_name in wanted if type_name in a.columns]
    a = a[available]
    a.plot(marker='o')
    # Display the figure
    plt.show()

# draw_group(): 'Generation' is the first grouping key, 'Type 1' the second key within each generation
a=df.groupby(['Generation','Type 1']).count()
# print(a)
# reset_index() turns the group keys back into ordinary columns, so Generation is repeated on every row
a=df.groupby(['Generation','Type 1']).count().reset_index()
# print(a)
# draw_group()


#(10)词云图

#(11)雷达图

6.5 自定义图片词云图

from PIL import Image
from wordcloud import WordCloud, ImageColorGenerator
import matplotlib.pyplot as plt
import numpy as np
import  jieba
def GetWordCloud():
    """Render a word cloud of 'jack3.txt' shaped and coloured by 'timg2.jpg'.

    Both files are read from the current working directory, the text is
    segmented with jieba, and the result is shown in a matplotlib window.
    """
    path_txt = 'jack3.txt'
    path_img = "timg2.jpg"
    # Context managers close both file handles as soon as the data is in memory
    with open(path_txt, 'r', encoding='UTF-8') as text_file:
        f = text_file.read()
    with Image.open(path_img) as mask_file:
        background_image = np.array(mask_file)
    # Segment the text with jieba and join the tokens with spaces. The author
    # notes that unsegmented Chinese text does not yield a correct word cloud
    # and that jieba offers several segmentation modes.
    cut_text = " ".join(jieba.cut(f))

    wordcloud = WordCloud(
        # A font with CJK glyphs is required, otherwise the words render as
        # empty boxes. This is the usual Windows font directory; substitute
        # another font file as needed.
        font_path="C:/Windows/Fonts/simfang.ttf",
        background_color="white",
        # The mask image sets the cloud's shape; per the author, width and
        # height settings have no effect once a mask is given
        mask=background_image).generate(cut_text)
    # Colour generator that samples the colours of the mask image
    image_colors = ImageColorGenerator(background_image)
    # Display the recoloured cloud without axes
    plt.imshow(wordcloud.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")
    plt.show()


if __name__ == '__main__':
    GetWordCloud()