Stata与pandas+numpy的常用功能
import pandas as pd
import numpy as np
输入数据
df = pd.DataFrame({'x': [1, 2, 3], 'y': [4, 5, 6]})
读取外部数据
url = ("https://raw.github.com/pandas-dev/pandas/master/pandas/tests/io/data/csv/tips.csv")
df = pd.read_csv(url)
限定输出
tips.head(5)
导出数据
tips.to_csv("tips2.csv")
对列进行操作
tips['total_bill'] = tips['total_bill'] - 2
tips['new_bill'] = tips['total_bill'] / 2
tips = tips.drop("new_bill", axis=1)
对行进行筛选
tips[tips['total_bill']>10]
使用if逻辑创建新列
tips['bucket'] = np.where(tips['total_bill']<10, 'low', 'high')
日期函数
tips['date1'] = pd.Timestamp('2013-01-15')
tips['date2'] = pd.Timestamp('2015-02-15')
tips['date1_year'] = tips['date1'].dt.year
tips['date2_month'] = tips['date2'].dt.month
tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin()
tips['months_between'] = tips['date2'].dt.to_period('M') - tips['date1'].dt.to_period('M')
保留、删除或重命名特定的列
tips[['sex', 'total_bill', 'tip']]
tips.drop('sex', axis=1)
tips.rename(columns={'total_bill': 'total_bill_2'})
以值为依据排序
tips = tips.sort_values(['sex', 'total_bill'])
确定字符串长度
tips['time'].str.len()
tips['time'].str.rstrip().str.len()
找到某个子字符串在字符串中的位置
tips['sex'].str.find('ale')
以位置为依据提取子字符串
tips['sex'].str[0:1]
提取第n个单词
firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']})
firstlast['First_Name'] = firstlast['String'].str.split(' ', expand=True)[0]
firstlast['Last_Name'] = firstlast['String'].str.split(' ', expand=True)[1]
改变大小写
firstlast = pd.DataFrame({'String': ['John Smith', 'Jane Cook']})
firstlast['upper'] = firstlast['String'].str.upper()
firstlast['lower'] = firstlast['String'].str.lower()
firstlast['title'] = firstlast['String'].str.title()
筛选缺失值
outer_join[outer_join['value_x'].isna()]
outer_join[outer_join['value_x'].notna()]
删除有缺漏值的行
outer_join.dropna()
使用上一行的值填充缺漏值
outer_join.fillna(method='ffill')
使用特定值取代缺漏值
outer_join['value_x'].fillna(outer_join['value_x'].mean())
分组求和
tips_summed = tips.groupby(['sex ', 'smoker'])[['total_bill', 'tip']].sum()
分组求特定的统计量
gb = tips.groupby('smoker')['total_bill']
tips['adj_total_bill'] = tips['total_bill'] - gb.transform('mean')
对每组取第一条记录
tips.groupby(['sex', 'smoker']).first()
将数据进行横向合并:增加变量
left_join = df1.merge(df2, on=['key'], how='left')
right_join = df1.merge(df2, on=['key'], how='right')
inner_join = df1.merge(df2, on=['key'], how='inner')
outer_join = df1.merge(df2, on=['key'], how='outer')
将数据进行纵向合并:追加样本
append_u = pd.read_stata('append_u.dta', index=False)
append_m = pd.read_stata('append_m.dta', index=False)
append_um = pd.concat([append_u, append_m], axis=0])
离群值
nlsw88 = pd.read_stata('work/nlsw88.dta')
wage_q99 = nlsw88['wage'].quantile(0.99)
wage_q1 = nlsw88['wage'].quantile(0.01)
nlsw88['wage'] = nlsw88['wage'].apply(lambda x: wage_q1 if x<wage_q1 else x)
nlsw88['wage'] = nlsw88['wage'].apply(lambda x: wage_q99 if x>wage_q99 else x)
|