Python的datatable ,在一定程度上不乏为pandas有力竞争者,其模仿R中data.table的核心算法和接口,致力于更快的、处理size更大的数据。 ?
?这里分享datatable的101个常用操作?,助快速上手datatable。
0、安装
pip install datatable
1、加载datatable、查看版本号
import datatable as dt
dt.__version__
2、三种方式创建datatable.Frame
import pandas as pd
import numpy as np
import datatable as dt
# Inputs
my_list = list('abcedfghijklmnopqrstuvwxyz')
my_arr = np.arange(26)
my_df = pd.DataFrame(dict(col1=my_list, col2=my_arr))
# Solution
dt_df1 = dt.Frame(my_list) #list创建
dt_df2 = dt.Frame(my_arr) #numpy.ndarray
dt_df3 = dt.Frame(my_df) #pandas.DataFrame
dt_df4 = dt.Frame(A=my_arr, B=my_list)
?3、读取csv文件为datatable.Frame
import datatable as dt
df = dt.fread(
'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv'
)
df.head(5)
?左下方会默认显示行列数,这是pandas不具有的~
4、 读取csv文件前5行
import datatable as dt
df = dt.fread(
'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
max_nrows=5)
df
?5、为datatable.Frame新增一列
# Input
import datatable as dt
df = dt.fread(
'https://raw.githubusercontent.com/selva86/datasets/master/BostonHousing.csv',
max_nrows=5)
# Solution
df[:, "new_column"] = dt.Frame([1, 2, 3, 4, 5]) #新增一列new_column
df
?6、取已有列创建新列
# Input
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
# Solution
df[:, "new_column"] = df[:, dt.f.age + dt.f.rad]
df.head()
?7、取已有列整数部分创建新列
# Input
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
# Solution
df[:, "new_column"] = df[:, dt.int32(dt.f.dis)]
df.head(5)
?8、按条件创建新列
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
#age列,年龄大于60赋值old,反之为new
df[:, "new_column"] = dt.Frame(np.where(df[:, dt.f.age > 60], 'Old', 'New'))
df.head(5)
?9、left join两个datatable.Frame
import datatable as dt
df1 = dt.Frame(A=[1, 2, 3, 4], B=["a", "b", "c", "d"])
df2 = dt.Frame(A=[1, 2, 3, 4, 5], C=["a2", "b2", "c2", "d2", "e2"])
df2.key = "A"
output = df1[:, :, dt.join(df2)]
print(df1)
print(df2)
output
?10、修改列名称
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
df.names = {'zn': 'zn_new'}
df.head(5)
?11、每隔1行读取csv文件
import datatable as dt
import csv
with open('datasets-master/BostonHousing.csv', 'r') as f:
reader = csv.reader(f)
for i, row in enumerate(reader):
row = [[x] for x in row]
# 1st row
if i == 0:
df = dt.Frame(row)
header = [x[0] for x in df[0, :].to_list()]
df.names = header
del df[0, :]
# Every 2th row
elif i % 2 == 0:
df_temp = dt.Frame(row)
df_temp.names = header
df.rbind(df_temp)
df.head(5)
?原始数据
?12、读入csv文件按条件修改列
import datatable as dt
import csv
with open('datasets-master/BostonHousing.csv', 'r') as f:
reader = csv.reader(f)
for i, row in enumerate(reader):
row = [[x] for x in row]
if i == 0:
df = dt.Frame(row)
header = [x[0] for x in df[0, :].to_list()]
df.names = header
del df[0, :]
else:
row[13] = ['High'] if float(row[13][0]) > 25 else ['Low'] # 最后一列大于25赋值High,否则赋值为Low
df_temp = dt.Frame(row)
df_temp.names = header
df.rbind(df_temp)
df.head(5)
13、修改特定位置的值
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
df[2, 1] = 5
df.head(5)
?14、datatable.Frame增删操作
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
#删除特定位置值
del df[2, 1]
#删除行
del df[3, :]
# 删除列
del df[:, "chas"]
# 按条件删除
del df[dt.f.zn == 0, :]
df.head(5)
?15、datatable.Frame与列表/字典/DataFrame等格式互转
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
# to pandas df
pd_df = df.to_pandas()
# to numpy arrays
np_arrays = df.to_numpy()
# to dictionary
dic = df.to_dict()
# to list
list_ = df[:,"indus"].to_list()
# to tuple
tuples_ = df[:,"indus"].to_tuples()
# to csv
df.to_csv("BostonHousing.csv")
16、获取datatable.Frame各列数据类型
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
for i in range(len(df.names)):
print(df.names[i], ":", df.stypes[i])
?17、datatable.Frame基础统计计算
import datatable as dt
df = dt.fread('datasets-master/BostonHousing.csv')
df.sum()
df.max()
df.min()
df.mean()
df.sd()
df.mode()
df.nmodal()
df.nunique()
?18、datatable.Frame中使用group by
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
df[:, dt.mean(dt.f.Price), dt.by("Manufacturer")].head(5)
?19、?datatable.Frame按升序sort
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# 方法1
df.sort("Price")
# 方法2
df[:,:, dt.sort(dt.f.Price)].head(5)
?20、?datatable.Frame按降序sort
import datatable as dt
df = dt.fread('datasets-master/Cars93_miss.csv')
# Solution
df[::-1, :, dt.sort(dt.f.Price)].head()
?
21、数据重复/追加5次
import?datatable?as?dt
df?=?dt.fread('datasets-master/Cars93_miss.csv')
#?Solution
dt.repeat(df,?5)
22、字符串替换
import?datatable?as?dt
df?=?dt.fread('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
#?Solution
df.replace("Audi",?"My?Dream?Car")
df.head(5)
23、按条件提取值
#?Input
import?datatable?as?dt
df?=?dt.fread('datasets-master/Cars93_miss.csv')
#?Solution
#?Get?the?highest?price
print("Highest?Price?:?",?df[:,?dt.f.Price].max()[0,?0])
#?Get?Manufacturer?with?highest?price
df[dt.f.Price?==?df[:,?dt.f.Price].max()[0,?0],
???['Manufacturer',?'Model',?'Type']]
24、修改列名
import?datatable?as?dt
df?=?dt.fread('datasets-master/Cars93_miss.csv')
#?Solution
old_col_name?=?"Model"
new_col_name?=?"Car?Model"
df.names?=?[new_col_name?if?x?==?old_col_name?else?x?for?x?in?df.names]
df.head(5)
25、统计每列NA值
import?datatable?as?dt
df?=?dt.fread('datasets-master/Cars93_miss.csv')
#?Solution
df.countna()
26、取一列
import?datatable?as?dt
df?=?dt.fread('datasets-master/Cars93_miss.csv')
#?Solution
df[:,?"Model"].head(5)
27、列顺序颠倒
#?Input
import?datatable?as?dt
df?=?dt.fread('datasets-master/Cars93_miss.csv')
#?Solution?1
df.head()
df[:,::-1].head(5)
28、格式化输出
import?datatable?as?dt
df?=?dt.Frame(random=np.random.random(4)**10)
df[:,?"random2"]?=?dt.Frame(['%.6f'?%?x?for?x?in?df[:,?"random"].to_list()[0]])
df
29、每隔20行按条件过滤
#?Input
import?datatable?as?dt
df?=?dt.fread('https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv')
#?Solution
df[::20,?['Manufacturer',?'Model',?'Type']]
30、rows颠倒
import?datatable?as?dt
df?=?dt.fread('datasets-master/Cars93_miss.csv')
print(df.head())
#?Solution
df[::-1,:]
31、按指定值归一化数据
#?Input
import?datatable?as?dt
df?=?dt.fread("datasets-master/BostonHousing.csv")
#?Solution
for?i?in?df.names:
????df[:,i]?=?df[:,(dt.f[i]?-?df[:,dt.min(dt.f[i])][0,0])/(df[:,dt.max(dt.f[i])][0,0]?-?df[:,dt.min(dt.f[i])][0,0])]
df.head(5)
32、group并取均值生成新列
#?Input
import?datatable?as?dt
df?=?dt.Frame(fruit=['apple',?'banana',?'orange']?*?3,
??????????????rating=np.random.rand(9),
??????????????price=np.random.randint(0,?15,?9))
df[:,?dt.mean(dt.f.price),?dt.by("fruit")]
33、按两列join
#?Input
import?datatable?as?dt
df1?=?dt.Frame(A=[1,?2,?3,?4],?B=["a",?"b",?"c",?"d"],?D=[1,?2,?3,?4])
df2?=?dt.Frame(A=[1,?2,?4,?5],
???????????????B=["a",?"b",?"d",?"e"],
???????????????C=["a2",?"b2",?"d2",?"e2"])
#?Solution
df2.key?=?["A",?"B"]
output?=?df1[:,?:,?dt.join(df2)]
output
34、ML小案例
import?datatable?as?dt
from?datatable.models?import?Ftrl
#?Import?data
train_df?=?dt.fread('pima_indian_diabetes_training_data.csv')
test_df?=?dt.fread('pima_indian_diabetes_testing_data.csv')
#?Create?Ftrl?model
ftrl_model?=?Ftrl()
#??add?parameter?values?while?creating?model
ftrl_model?=?Ftrl(alpha=0.1,?lambda1=0.5,?lambda2=0.6)
#?change?paramter?of?existing?model
ftrl_model.alpha?=?0.1
ftrl_model.lambda1?=?0.5
ftrl_model.lambda2?=?0.6
#?Prepare?training?and?test?dataset
train_df[:,?"diabetes"]?=?dt.Frame(
????np.where(train_df[:,?dt.f["diabetes"]?==?"pos"],?1,?0))
test_df[:,?"diabetes"]?=?dt.Frame(
????np.where(test_df[:,?dt.f["diabetes"]?==?"pos"],?1,?0))
x_train?=?train_df[:,?[
????"pregnant",?"glucose",?"pressure",?"mass",?"pedigree",?"age"
]]
y_train?=?train_df[:,?["diabetes"]]
x_test?=?test_df[:,?[
????"pregnant",?"glucose",?"pressure",?"mass",?"pedigree",?"age"
]]
y_test?=?test_df[:,?["diabetes"]]
#?training?the?model
ftrl_model.fit(x_train,?y_train)
#?predictions?of?the?model
targets?=?ftrl_model.predict(x_test)
print(targets.head(5))
#?feature?importance
fi?=?ftrl_model.feature_importances
fi
REF:https://www.machinelearningplus.com/data-manipulation/101-python-datatable-exercises-pydatatable/
|