一、创建Excel文件?
import pandas as pd
pf = pd.DataFrame()
pf.to_excel('D:/python_work/text1/d.xlsx')
print("Done!")
二、操作Excel文件?
1.往文件里写入数据:
import pandas as pd
pf = pd.DataFrame({'ID':[1,2,3],'Name':['Tim','Victor','Jack']})
pf.set_index('ID', inplace = True)
pf.to_excel('D:/python_work/text1/d.xlsx')
print("Done!")
生成的文件:
data:image/s3,"s3://crabby-images/4d34b/4d34b101aed143f73c0fbb99b7909bde227dde1b" alt=""
?2.读写Excel文件:
import pandas as pd
pf = pd.read_excel('D:/python_work/text1/d.xlsx')
3.查看行数和列数:
import pandas as pd
pf = pd.read_excel('D:/python_work/text1/d.xlsx')
print(pf.shape)#Here
4.查看表头名:
import pandas as pd
pf = pd.read_excel('D:/python_work/text1/d.xlsx')
print(pf.columns)#Here
5.查看“头”和“尾”数据
pf = pd.read_excel('D:/python_work/text1/d.xlsx')
print(pf.head(3))#3的意思是看前三行
print(pf.tail(3))#3的意思是看后三行
6.设置开始读取行数位置
pf = pd.read_excel('D:/python_work/text1/d.xlsx', header=1)
#从excel的第二行开始读取
7.手动设置列头
pf = pd.read_excel('D:/python_work/text1/d.xlsx', header=None)#把header设置为None
pf.columns = ['ID', 'Type','Title','FirstName']
8.复制Excel表
pf = pd.read_excel('D:/python_work/text1/d.xlsx', index_col='ID')
#index_col的作用是设ID为第一个列头
pf.to_excel('D:/python_work/text1/d2.xlsx')
9.通过Series创建行和列
import pandas as pd
s1 = pd.Series([1,2,3], index=[1,2,3], name='A')#index设置的是行号
s2 = pd.Series([10,20,30], index=[1,2,3], name='B')
s3 = pd.Series([100,200,300], index=[1,2,3], name='C')
df = pd.DataFrame({s1.name:s1,s2.name:s2,s3.name:s3} )
print(df)
打印结果:
data:image/s3,"s3://crabby-images/54923/549231f00dabb26de9951b9cab0e14ac2b8d4111" alt=""
10.设置跳过读取的空行和设置读取范围内列,并指定列的数据类型
pf = pd.read_excel('D:/python_work/text1/d.xlsx',skiprows=3,usecols="C,F",index_col='ID',dtype={'ID':str})
#skiprows参数设置跳过行数,usecols设置读取的列标范围。
11.设置列表的值
pf['ID'].at[0] = 100
#设置列头ID的第一行数据的值为100
12.设置列表排序
pf = pd.read_excel('D:/python_work/text1/List.xlsx',index_col='ID')
pf.sort_values(by='Price', inplace=True, ascending=True)
#设置price列里的值从低到高排序
pf = pd.read_excel('D:/python_work/text1/List.xlsx',index_col='ID')
pf.sort_values(by='Price', inplace=True, ascending=False)
#ascending是上升的意思,所以设为False就为下降的意思,使price列里的值从高到低排序
如果要排序两列需这样写:
pf = pd.read_excel('D:/python_work/text1/List.xlsx',index_col='ID')
pf.sort_values(by=['Worthy','Price'],inplace=True,ascending=[True,False])
13.筛选和过滤
import pandas as pd
def age_18_to_30(a):
return 18<=a<30
def leve_a(s):
return 85<=s<=100
pf = pd.read_excel('D:/python_work/text1/Students.xlsx',index_col='ID')
pf = pf.loc[pf['Age'].apply(age_18_to_30)].loc[pf['Score'].apply(leve_a)]
或者用lambda写
pf = pd.read_excel('D:/python_work/text1/Students.xlsx',index_col='ID')
pf = pf.loc[pf['Age'].apply(lambda a:18<=a<30)].loc[pf['Score'].apply(lambda s:85<=s<=100)]
输出结果:?
data:image/s3,"s3://crabby-images/50d38/50d384d4b3e5b8c5a3d3870ca5fd86e7d4aff824" alt=""
14.制图----柱状图
用pandas绘图
import pandas as pd
import matplotlib.pyplot as plt
pf = pd.read_excel('D:/python_work/text1/Students.xlsx')
pf.sort_values(by='Number', inplace=True,ascending=False)#从高到低排序
pf.plot.bar(x='Field', y='Number',color='orange',title='International Student by Field')#设计图表
plt.tight_layout()#紧凑型布局,用于显示全标签
plt.show()#显示图表
效果:
data:image/s3,"s3://crabby-images/8e0cf/8e0cf9f836fc00b409a11c8c013e727f6196abff" alt=""
?用matplotlib绘图:
import pandas as pd
import matplotlib.pyplot as plt
pf = pd.read_excel('D:/python_work/text1/Students.xlsx')
pf.sort_values(by='Number', inplace=True,ascending=False)#从高到低排序
plt.bar(pf.Field,pf.Number,color='orange')#绘制图表x,y轴的内容,颜色为橙色
plt.xticks(pf.Field,rotation='90')#将Field的内容旋转90°
plt.xlabel('Field')#设置x轴标签
plt.ylabel('Number')#设置y轴标签
plt.title('International Student by Field',fontsize = 16)#设置标题且字号为16
plt.tight_layout()#紧凑型布局,用于显示全标签
plt.show()#显示图表
效果:
data:image/s3,"s3://crabby-images/88488/884885e046731b3589ea3999462bc500c0b87856" alt=""
15.制图----分组柱状图?
?pandas和matplotlib相结合
import pandas as pd
import matplotlib.pyplot as plt
pf = pd.read_excel('D:/python_work/text1/Students.xlsx')
pf.sort_values(by='2017',inplace=True,ascending=False)#排序
pf.plot.bar(x='Field',y=['2016','2017'],color=['orange','red'])
plt.title('ISF',fontsize=16,fontweight='bold')
plt.xlabel('Field',fontweight='bold')
plt.ylabel('Number',fontweight='bold')
ax = plt.gca()#!!!获取轴上的内容!!!
ax.set_xticklabels(pf['Field'],rotation=45,ha='right')#编辑x轴上的内容,ha是设置旋转中心点
f = plt.gcf()#!!!获取图形!!!
f.subplots_adjust(left=0.18,bottom=0.4)#编辑图形的位置
plt.show()
效果:
data:image/s3,"s3://crabby-images/ce70f/ce70f81c955c1ac81418b581e63dba234467bbbe" alt=""
16.制图----叠加水平柱状图??
import pandas as pd
import matplotlib.pyplot as plt
pf = pd.read_excel('D:/python_work/text1/Users.xlsx')
pf['Total'] = pf['Oct']+pf['Nov']+pf['Dec']#增加一列
pf.sort_values(by='Total',inplace=True,ascending=False)
print(pf)
pf.plot.barh(x='Name',y=['Oct','Nov','Dec'],stacked=True,title='Users Behaver')
#barh水平柱状图,stacked用于叠加
plt.tight_layout()
plt.show()
效果:
data:image/s3,"s3://crabby-images/6d9c8/6d9c8039ec7d7571ec9464f2df1c6a6957b6d81a" alt=""
17.制图----饼图??
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel('D:/python_work/text1/Students.xlsx',index_col='From')
print(students)
students['2017'].plot.pie(fontsize=8,counterclock=False,startangle =-270)
plt.title('Source of International Students', fontsize=16, fontweight='bold')
plt.ylabel('2017', fontsize=12,fontweight='bold')
plt.tight_layout()
plt.show()
效果:
data:image/s3,"s3://crabby-images/48f6c/48f6cf13cc43ff5431afdbf92df5e84fb1767ae8" alt=""
18.制图----折线图??
import pandas as pd
import matplotlib.pyplot as plt
users = pd.read_excel('D:/python_work/text1/Orders.xlsx',index_col='Week')
print(users)
print(users.columns)
users.plot(y=['Accessories', 'Bikes', 'Clothing', 'Components', 'Grand Total'])
plt.title('Oders',fontsize=16)
plt.ylabel('Total',fontsize=12,fontweight='bold')
plt.xticks(users.index,fontsize=8)#因为week转化为了index,所以.index,x轴更详细
plt.show()
效果:
data:image/s3,"s3://crabby-images/f97c6/f97c6d1aeee7067fd5cbd645f04617ed60729975" alt=""
users.plot.area(y=['Accessories', 'Bikes', 'Clothing', 'Components', 'Grand Total'])
#加上area
效果:
data:image/s3,"s3://crabby-images/ebf21/ebf21b14e6ac3d0d81f118bddb7dbc85282df41e" alt=""
19.制图----散点图??
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 777 #显示所有列
homes = pd.read_excel('D:/python_work/text1/home_data.xlsx')
print(homes.head())
homes.plot.scatter(x='sqft_living', y='price')
plt.show()
效果:
data:image/s3,"s3://crabby-images/cded7/cded7ff303f2f367011f31bd596c05a6043ba8d4" alt=""
20.制图----直方图??
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 777 #显示所有列
homes = pd.read_excel('D:/python_work/text1/home_data.xlsx')
print(homes.head())
homes.sqft_living.plot.hist(bins=100)
plt.xticks(range(0,max(homes.sqft_living),500), fontsize=8,rotation=90)
plt.show()
效果:
data:image/s3,"s3://crabby-images/9e426/9e4264549de08b775b2817b62a2e89139b7def44" alt=""
21.制图----密度图??
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 777 #显示所有列
homes = pd.read_excel('D:/python_work/text1/home_data.xlsx')
print(homes.head())
homes.sqft_living.plot.kde()
plt.xticks(range(0,max(homes.sqft_living),500), fontsize=8,rotation=90)
plt.show()
效果:
data:image/s3,"s3://crabby-images/d82fd/d82fd54cbaf00bd0ba5ce1a864af5bdadf31ab08" alt=""
?22.分析数据的相关性corr():
import pandas as pd
import matplotlib.pyplot as plt
pd.options.display.max_columns = 777 #显示所有列
homes = pd.read_excel('D:/python_work/text1/home_data.xlsx')
print(homes.corr())
输出内容:
data:image/s3,"s3://crabby-images/9c427/9c427bd4ff316e70676d8c2e622e78c32be42c89" alt=""
23.多表联合:
import pandas as pd
import matplotlib.pyplot as plt
students = pd.read_excel('D:/python_work/text1/Student_Score.xlsx', sheet_name='Students', index_col='ID')
scores = pd.read_excel('D:/python_work/text1/Student_Score.xlsx', sheet_name='Scores', index_col='ID')
table = students.join(scores, how='left').fillna(0)
# how参数:因为students先调用所以为左,左边的数据比右边多,默认情况下pandas会删除多出的部分,所以要用到'left'来保留多出的数据
table.Score = table.Score.astype(int)
print(table)
输出内容:
data:image/s3,"s3://crabby-images/55222/5522295dc69434a84673afe0cfa748d28826da0f" alt="" data:image/s3,"s3://crabby-images/a4f08/a4f08b3603824761105bab0c28e866ebbbe4ca19" alt="" data:image/s3,"s3://crabby-images/bcf7c/bcf7c0ab999045e7a53e194c6848bee174f0881c" alt=""
24.数据的校验:
import pandas as pd
def score_validation(row):
try:
assert 0<=row.Score<=100
except:
print(f'#{row.ID}Students{row.Name}has an invaild score{row.Score}')
students = pd.read_excel('D:/python_work/text1/Students.xlsx')
students.apply(score_validation, axis=1)#axis参数:0是从上到下,1是从左到右
输出内容:
?data:image/s3,"s3://crabby-images/ce023/ce023fcddfb25a28e45cc2653257f687c4da1576" alt=""
25.把一列数据分割成两列:
import pandas as pd
employees = pd.read_excel('D:/python_work/text1/Employees.xlsx',index_col='ID')
spl = employees['Full Name'].str.split(expand=True)
employees['FirstName'] = spl[0]
employees['LastName'] = spl[1]
print(employees)
?输出内容:
data:image/s3,"s3://crabby-images/9cb5d/9cb5da54a2b425129b312b05763195d5732bb94a" alt="" data:image/s3,"s3://crabby-images/aae49/aae49ab2f84d51176a984a64d38cc1202c038a01" alt=""
26.求和求平均统计:
import pandas as pd
students = pd.read_excel('D:/python_work/text1/Students19.xlsx',index_col='ID')
temp = students[['Test_1','Test_2','Test_3']]
#每位学生的每一次测试的总成绩
students['Total'] = temp.sum(axis=1).astype(int)
#每位学生的每一次测试的总成绩的平均值,round用于保留两位小数
students['Average'] = round(temp.mean(axis=1).astype(float) ,2)
#每次测试的全班平均值
col_mean = students[['Test_1','Test_2','Test_3','Total','Average']].mean()
col_mean['Name'] = 'Summary'
#添加到students中
students = students.append(col_mean,ignore_index=True)
print(students)
?输出内容:
data:image/s3,"s3://crabby-images/ca969/ca969f6a99a334ef48dc12efd88d57cdabfc79cd" alt="" ?
27.定位清除重复数据:
定位清除数据:
import pandas as pd
students = pd.read_excel('D:/python_work/text1/Students_Duplicates.xlsx',index_col='ID')
students.drop_duplicates(subset='Name', inplace=True,keep='first')
#subset参数是定位重复内容所在的列,keep参数:first值是保留第一次重复出现的数据,last保留最后一次重复出现的数据
print(students)
data:image/s3,"s3://crabby-images/5a89a/5a89a3bb7819fa613c4f0622a8ce5d0d662b8be3" alt="" ?
??扫描哪些行是重复的:
import pandas as pd
students = pd.read_excel('D:/python_work/text1/Students_Duplicates.xlsx',index_col='ID')
#扫描哪些行是重复的
dupe = students.duplicated(subset='Name')
print(dupe)
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?data:image/s3,"s3://crabby-images/44783/44783b2730228282eb0b30c78aaf5efef112a62c" alt=""
?抓取重复的数据:
import pandas as pd
students = pd.read_excel('D:/python_work/text1/Students_Duplicates.xlsx',index_col='ID')
#扫描哪些行是重复的
dupe = students.duplicated(subset='Name')
dupe = dupe[dupe == True]
print(students.loc[dupe.index])#loc[]用来定位
效果:
data:image/s3,"s3://crabby-images/3fd4b/3fd4b54c73aa8ba56a93322f50327fa84feac263" alt=""
28.旋转数据表:
import pandas as pd
pd.options.display.max_columns = 999;
videos = pd.read_excel('D:/python_work/text1/Videos.xlsx')
table = videos.transpose()
print(table)
效果:
data:image/s3,"s3://crabby-images/c436b/c436bbaa6bc591c0b8ae1f4ae445cae1eb8420eb" alt="" data:image/s3,"s3://crabby-images/0386a/0386a971167ecefa0eeb438cb76d3239bc679e73" alt=""
29.读取TXT、TSV、CSV的数据:
import pandas as pd
student1 = pd.read_csv('D:/python_work/text1/excle/Students022.csv', index_col='ID')
student2 = pd.read_csv('D:/python_work/text1/excle/Students022.tsv',sep='\t',index_col='ID')
student3 = pd.read_csv('D:/python_work/text1/excle/Students022.txt',sep='|',index_col='ID')
#sep参数是输入分割符
print(student2)
30.透视表和数据聚合:
import pandas as pd
import numpy as np
pd.options.display.max_columns = 777
orders = pd.read_excel('D:/python_work/text1/excle/Orders023.xlsx')
orders['Year'] = pd.DatetimeIndex(orders['Date']).year
pt1 = orders.pivot_table(index='Category', columns='Year', values='Total', aggfunc=np.sum)
print(pt1)
import pandas as pd
pd.options.display.max_columns = 777
orders = pd.read_excel('D:/python_work/text1/excle/Orders023.xlsx')
orders['Year'] = pd.DatetimeIndex(orders['Date']).year
groups = orders.groupby(['Category', 'Year'])
s = groups['Total'].sum()
c = groups['ID'].count()
pt = pd.DataFrame({'Sum':s,'Count':c})
print(pt)
效果:
data:image/s3,"s3://crabby-images/e5c74/e5c74d45a910967e1c66b700140756a9172fd248" alt="" data:image/s3,"s3://crabby-images/e7a46/e7a466abcbb6c8f7e883691ae5b3232f7db55379" alt=""
31.制图----线性回归、数据预测??:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import linregress
pd.options.display.max_columns = 777
sales = pd.read_excel('D:/python_work/text1/excle/Sales.xlsx',dtype={'Month':str})
slope, intercept, r,p,std_err = linregress(sales.index,sales.Revenue)
#斜率,y轴上的截距, 标准差
exp = sales.index*slope + intercept#期望值
plt.scatter(sales.index, sales.Revenue)
plt.plot(sales.index,exp,color='orange')
plt.title(f"y={slope}*X+{intercept}")
plt.xticks(sales.index, sales.Month, rotation=90)
plt.tight_layout()
plt.show()
效果:
data:image/s3,"s3://crabby-images/0ec00/0ec00c5bfe4efbdd6af62073bbc1cbe355e6d155" alt="" ?
?32.给表格添加格式:
需要用到Jupyter Notebook
import pandas as pd
def low_score_red(s):
color = 'red' if s<60 else 'green'
return f'color:{color}'
def hight_score_green(col):
return ['background-color:lime' if s==col.max() else 'background-color:white'for s in col]
students = pd.read_excel('D:/python_work/text1/excle/Students.xlsx')
students.style.applymap(low_score_red, subset=['Test_1', 'Test_2', 'Test_3']).apply(hight_score_green, subset=['Test_1', 'Test_2', 'Test_3'])
#applymap()是应用到数据的区域,apply()是应用到列或者行
效果:
data:image/s3,"s3://crabby-images/d9c23/d9c239ef869409117430c42f99455440a4bb8249" alt="" data:image/s3,"s3://crabby-images/2f659/2f659390890aeb2314f152b4f675e6e25a266bc2" alt=""
import pandas as pd
students = pd.read_excel('D:/python_work/text1/excle/Students.xlsx')
students.style.bar(color='orange', subset=['Test_1','Test_2','Test_3'])
?data:image/s3,"s3://crabby-images/33bfc/33bfc4ef464e04be41016c3dda508565bd5515ce" alt=""
?33.尾部追加表格、插入表格:
|