TensorFlow's CTCGreedyDecoder ships with a CPU implementation only, whereas PaddlePaddle's greedy decoder, which is closer to production needs, can also run on the GPU. In short, PaddlePaddle composes the decoder from two operators: a topk op first picks the most probable class at each time step, then CTCAlignOp performs the post-processing (merging repeats and removing blanks). TensorFlow emits a SparseTensor, while PaddlePaddle supports both Tensor (padding mode) and LoDTensor (variable-length mode) outputs.
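To make the decomposition concrete, here is a minimal NumPy sketch of the same two-step pipeline (an illustrative reference only, not Paddle code; ctc_greedy_decode is a hypothetical helper):

import numpy as np

def ctc_greedy_decode(probs, blank=0, merge_repeated=True):
    # probs: [T, num_classes] per-step class probabilities.
    best = np.argmax(probs, axis=1)       # step 1: the topk(k=1) operator
    out, prev = [], -1                    # step 2: what CTCAlignOp does
    for t in best:
        if t != blank and not (merge_repeated and t == prev):
            out.append(int(t))
        prev = t
    return out

probs = np.array([[0.1, 0.8, 0.1],        # -> class 1
                  [0.1, 0.8, 0.1],        # -> class 1, merged with previous
                  [0.9, 0.05, 0.05],      # -> blank, dropped
                  [0.1, 0.1, 0.8]])       # -> class 2
print(ctc_greedy_decode(probs))           # [1, 2]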
check_variable_and_dtype validates the variable's type and data type, and LayerHelper mainly shares boilerplate across the layers functions. Internally the topk operator is called to obtain topk_indices, the index of the most probable class at each step.
check_variable_and_dtype(input, 'input', ['float32', 'float64'],
                         'ctc_greedy_decoder')
helper = LayerHelper("ctc_greedy_decoder", **locals())
_, topk_indices = topk(input, k=1)
LayerHelperBase.create_variable_for_type_inference creates a temporary variable. In LoD mode the final result comes directly from ctc_align; in padding mode the input is 3-D, so ctc_out_len is created as well and the squeeze operator drops the last dimension of topk_indices.
ctc_out = helper.create_variable_for_type_inference(dtype="int64")
if input_length is None:
    helper.append_op(
        type="ctc_align",
        inputs={"Input": [topk_indices]},
        outputs={"Output": [ctc_out]},
        attrs={"merge_repeated": True,
               "blank": blank})
    return ctc_out
else:
    ctc_out_len = helper.create_variable_for_type_inference(dtype="int64")
    ctc_input = squeeze(topk_indices, [2])
    helper.append_op(
        type="ctc_align",
        inputs={"Input": [ctc_input],
                "InputLength": [input_length]},
        outputs={"Output": [ctc_out],
                 "OutputLength": [ctc_out_len]},
        attrs={
            "merge_repeated": True,
            "blank": blank,
            "padding_value": padding_value
        })
    return ctc_out, ctc_out_len
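A usage sketch of both modes, based on the signature above (written against the fluid static-graph API, whose details may vary across Paddle versions):

import paddle.fluid as fluid

# LoD mode: a 2-D LoDTensor of shape [sum(seq_len), num_classes];
# only the decoded LoDTensor is returned.
x = fluid.data(name='x', shape=[None, 8], dtype='float32', lod_level=1)
out = fluid.layers.ctc_greedy_decoder(input=x, blank=7)

# Padding mode: a 3-D Tensor [batch, T, num_classes] plus per-row lengths;
# both the padded output and its lengths are returned.
y = fluid.data(name='y', shape=[None, 20, 8], dtype='float32')
y_len = fluid.data(name='y_len', shape=[None, 1], dtype='int64')
out_pad, out_pad_len = fluid.layers.ctc_greedy_decoder(
    input=y, blank=7, input_length=y_len, padding_value=7)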
REGISTER_OPERATOR establishes the correspondence between the Python function name and the operator implementation. The inheritance chain is:

CTCAlignOp → OperatorWithKernel → OperatorBase

OperatorBase is the basic building block of network computation, and OperatorWithKernel is its kernel-bearing subclass. OP_INOUT_CHECK asserts that the operator has the required inputs and outputs, and OperatorWithKernel::IndicateVarDataType retrieves a variable's data type.
class CTCAlignOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ctc_align");
    OP_INOUT_CHECK(ctx->HasOutput("Output"), "Output", "Output", "ctc_align");

    auto input_dims = ctx->GetInputDim("Input");
    ctx->SetOutputDim("Output", input_dims);
    if (ctx->HasInput("InputLength")) {
      ctx->SetOutputDim("OutputLength", {input_dims[0], 1});
    }
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
        ctx.device_context());
  }
};
The templated kernel below is the CPU implementation.
template <typename DeviceContext, typename T>
class CTCAlignKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* input = ctx.Input<LoDTensor>("Input");
    auto* output = ctx.Output<LoDTensor>("Output");
    size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
    bool merge_repeated = ctx.Attr<bool>("merge_repeated");
    T* output_data = output->mutable_data<T>(ctx.GetPlace());
    auto input_dims = input->dims();
    const T* input_data = input->data<T>();
If the input carries no LoD, the operator runs in padding mode: each batch row is scanned up to its InputLength, skipping blanks (and repeats when merging is enabled).
    if (input->lod().empty()) {
      size_t padding_value =
          static_cast<size_t>(ctx.Attr<int>("padding_value"));
      auto* input_length = ctx.Input<LoDTensor>("InputLength");
      const T* input_length_data = input_length->data<T>();

      auto* output_length = ctx.Output<LoDTensor>("OutputLength");
      T* output_length_data = output_length->mutable_data<T>(ctx.GetPlace());

      for (size_t batch_id = 0; batch_id < (unsigned)input_dims[0];
           batch_id++) {
        T prev_token = -1;
        size_t output_idx = 0;
        for (size_t i = 0; i < (unsigned)input_length_data[batch_id]; i++) {
          size_t input_ind = batch_id * input_dims[1] + i;
          if ((unsigned)input_data[input_ind] != blank &&
              !(merge_repeated && input_data[input_ind] == prev_token)) {
            output_data[batch_id * input_dims[1] + output_idx] =
                input_data[input_ind];
            ++output_idx;
          }
          prev_token = input_data[input_ind];
        }
        output_length_data[batch_id] = output_idx;
        // fill the rest of the row with padding_value
        for (size_t j = output_idx; j < (unsigned)input_dims[1]; j++)
          output_data[batch_id * input_dims[1] + j] = padding_value;
      }
In LoD mode, ToAbsOffset converts the LoD to absolute offsets, so input_lod[level][i] gives the start row of sequence i.
    } else {
      const size_t level = 0;
      auto input_lod = framework::ToAbsOffset(input->lod());

      PADDLE_ENFORCE_EQ(
          input_dims[0], static_cast<int64_t>(input_lod[level].back()),
          platform::errors::InvalidArgument(
              "The first dimension %d of CTCAlign operator Input(Input) should "
              "be equal to "
              "the sum of all sequences' lengths %d.",
              input_dims[0], static_cast<int64_t>(input_lod[level].back())));

      const size_t num_sequences = input_lod[level].size() - 1;

      // merge repeated tokens and delete blank
      size_t output_idx = 0;
      std::vector<size_t> output_lod0(1, 0);
      for (size_t seq_idx = 0; seq_idx < num_sequences; ++seq_idx) {
        T prev_token = -1;
        for (size_t i = input_lod[level][seq_idx];
             i < input_lod[level][seq_idx + 1]; ++i) {
          if ((unsigned)input_data[i] != blank &&
              !(merge_repeated && input_data[i] == prev_token)) {
            output_data[output_idx] = input_data[i];
            ++output_idx;
          }
          prev_token = input_data[i];
        }
        output_lod0.push_back(output_idx);
      }

      // set output lod
      framework::LoD output_lod;
      output_lod.push_back(output_lod0);
      output->set_lod(output_lod);

      // resize output dims
      output->Resize({static_cast<int64_t>(output_lod0.back()), 1});

      // for an all-blank input, emit a single -1
      if (output_lod0.back() == 0) {
        output->Resize({1, 1});
        output_data = output->mutable_data<T>(ctx.GetPlace());
        output_data[0] = -1;
      }
    }
  }
};
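The LoD branch condensed into Python (an illustrative sketch; lod0 holds level-0 absolute offsets, as produced by ToAbsOffset, and ctc_align_lod is a made-up helper name):

import numpy as np

def ctc_align_lod(tokens, lod0, blank, merge_repeated=True):
    out, out_lod0 = [], [0]
    for s in range(len(lod0) - 1):            # one pass per sequence
        prev = -1
        for i in range(lod0[s], lod0[s + 1]):
            t = tokens[i]
            if t != blank and not (merge_repeated and t == prev):
                out.append(int(t))
            prev = t
        out_lod0.append(len(out))             # cumulative end of this sequence
    if not out:                               # every token was blank
        return np.array([[-1]]), out_lod0
    return np.array(out).reshape(-1, 1), out_lod0

tokens = np.array([0, 1, 1, 0, 2, 2, 0])      # two sequences, blank = 0
print(ctc_align_lod(tokens, [0, 4, 7], blank=0))
# (array([[1], [2]]), [0, 1, 2])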
The CUDA implementation consists of three parts: CTCAlignOpCUDAKernel::Compute, PaddingMergeAndDelCudaKernel, and MergeAndDelCudaKernel. ExecutionContext::Input returns a pointer to the variable with the given name, and ExecutionContext::Attr reads an operator attribute. LoDTensor is an alias of DenseTensor.
template <typename T>
class CTCAlignOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), true,
                      platform::errors::InvalidArgument(
                          "CTCAlign operator CUDA kernel must use CUDAPlace "
                          "rather than CPUPlace."));
    auto* input = ctx.Input<LoDTensor>("Input");
    auto* output = ctx.Output<LoDTensor>("Output");
    const int blank = ctx.Attr<int>("blank");
    const int merge_repeated =
        static_cast<int>(ctx.Attr<bool>("merge_repeated"));
    const T* tokens = input->data<T>();
    auto stream = ctx.cuda_device_context().stream();
DenseTensor::lod returns the LoD object held in DenseTensorMeta. If the input has no level of detail it is an ordinary Tensor, and PaddingMergeAndDelCudaKernel is launched. DenseTensor::mutable_data returns the data pointer.
    if (input->lod().empty()) {
      const int padding_value = ctx.Attr<int>("padding_value");
      auto input_dims = input->dims();
      T* output_data = output->mutable_data<T>({input_dims[0], input_dims[1]},
                                               ctx.GetPlace());
      auto* input_length = ctx.Input<LoDTensor>("InputLength");
      const T* input_length_data = input_length->data<T>();
      auto* output_length = ctx.Output<LoDTensor>("OutputLength");
      T* output_length_data =
          output_length->mutable_data<T>({input_dims[0], 1}, ctx.GetPlace());
      PaddingMergeAndDelCudaKernel<
          T><<<32, (input_dims[0] + 32 - 1) / 32, 0, stream>>>(
          input_dims[1], tokens, input_length_data, blank, merge_repeated,
          padding_value, input_dims[0], output_data, output_length_data);
Otherwise MergeAndDelCudaKernel is launched with a single thread to merge and delete tokens across all sequences; ToAbsOffset again yields the absolute offsets.
    } else {
      const size_t level = 0;
      auto input_lod = framework::ToAbsOffset(input->lod());

      const int64_t num_tokens = input->dims()[0];
      const size_t num_seq = input_lod[level].size() - 1;

      // merge repeated tokens and delete blank
      thrust::device_vector<size_t> dev_out_lod0(input_lod[level].size());
      size_t* dev_out_lod0_ptr = thrust::raw_pointer_cast(dev_out_lod0.data());
      T* output_data =
          output->mutable_data<T>({num_tokens, 1}, ctx.GetPlace());
      paddle::framework::MixVector<size_t> mixv_input_lod(&input_lod[level]);
      MergeAndDelCudaKernel<T><<<1, 1, 0, stream>>>(
          num_tokens, tokens, num_seq,
          mixv_input_lod.CUDAMutableData(ctx.GetPlace()), blank, merge_repeated,
          dev_out_lod0_ptr, output_data);
      mixv_input_lod.CopyToCPU();

      // set output lod
      std::vector<size_t> host_out_lod0(dev_out_lod0.begin(),
                                        dev_out_lod0.end());
      framework::LoD out_lod;
      out_lod.push_back(host_out_lod0);
      output->set_lod(out_lod);

      // resize output dims
      output->Resize({static_cast<int64_t>(host_out_lod0.back()), 1});

      if (host_out_lod0.back() == 0) {
        output->Resize({1, 1});
        output->mutable_data<T>(ctx.GetPlace());
        phi::funcs::SetConstant<platform::CUDADeviceContext, T> set_constant;
        set_constant(ctx.template device_context<platform::CUDADeviceContext>(),
                     output, -1);
      }
    }
  }
};
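One detail worth noting: the launch configuration passes 32 as the grid size and ceil(batch/32) as the block size, the reverse of the more common ceil(N/32) blocks of 32 threads. Coverage still holds, since 32 × ceil(batch/32) ≥ batch and the kernel's bound check discards surplus threads; a quick check in Python:

import math

for batch in (1, 31, 32, 33, 1000):
    threads_per_block = math.ceil(batch / 32)   # second launch argument
    assert 32 * threads_per_block >= batch      # every batch row gets a thread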
Each thread processes a single batch row. When a token is not the blank label and does not need to be merged, it is written to the output.
template <typename T>
__global__ void PaddingMergeAndDelCudaKernel(
    const int64_t num_token, const T* tokens, const T* tokens_length,
    const int blank, const int merge_repeated, const int padding_value,
    const int64_t batch_size, T* output, T* output_length) {
  int ind = blockIdx.x * blockDim.x + threadIdx.x;
  if (ind >= batch_size) return;
  int output_idx = ind * num_token;
  T prev_token = -1;
  for (int i = ind * num_token; i < ind * num_token + tokens_length[ind];
       i++) {
    if ((unsigned)tokens[i] != blank &&
        !(merge_repeated && tokens[i] == prev_token)) {
      output[output_idx] = tokens[i];
      ++output_idx;
    }
    prev_token = tokens[i];
  }
The output length is recorded in output_length, and the tail of the row is filled with padding_value.
  output_length[ind] = output_idx - ind * num_token;
  for (int i = output_idx; i < ind * num_token + num_token; i++) {
    output[i] = padding_value;
  }
}
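The same logic row by row in Python (illustrative; ctc_align_padding is a made-up name):

import numpy as np

def ctc_align_padding(tokens, lengths, blank, padding_value=0,
                      merge_repeated=True):
    batch, num_token = tokens.shape
    out = np.full((batch, num_token), padding_value, dtype=tokens.dtype)
    out_len = np.zeros((batch, 1), dtype=np.int64)
    for b in range(batch):                  # the CUDA kernel runs one b per thread
        idx, prev = 0, -1
        for i in range(lengths[b]):
            t = tokens[b, i]
            if t != blank and not (merge_repeated and t == prev):
                out[b, idx] = t
                idx += 1
            prev = t
        out_len[b, 0] = idx
    return out, out_len

tokens = np.array([[1, 1, 0, 2, 0]])        # blank = 0
print(ctc_align_padding(tokens, lengths=[5], blank=0))
# (array([[1, 2, 0, 0, 0]]), array([[2]]))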
For each sequence, lod0 provides its index range. Blank labels, as well as repeats when merging is enabled, are skipped. out_lod0 records the cumulative end index of each sequence in the flattened output.
template <typename T>
__global__ void MergeAndDelCudaKernel(const int64_t num_token, const T* tokens,
                                      const size_t num_seq, size_t* lod0,
                                      const int blank, const int merge_repeated,
                                      size_t* out_lod0, T* output) {
  int ouput_idx = 0;
  out_lod0[0] = 0;

  for (int i = 0; i < num_seq; ++i) {
    T pre_token = -1;
    for (int j = lod0[i]; j < lod0[i + 1]; ++j) {
      if (tokens[j] != blank && !(merge_repeated && tokens[j] == pre_token)) {
        output[ouput_idx] = tokens[j];
        ++ouput_idx;
      }
      pre_token = tokens[j];
    }
    out_lod0[i + 1] = ouput_idx;
  }
}
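Given the flattened output and out_lod0, each decoded sequence is recovered by slicing at the recorded offsets, e.g.:

out = [1, 2, 3]                               # flattened decoded tokens
out_lod0 = [0, 1, 3]                          # cumulative sequence boundaries
seqs = [out[out_lod0[i]:out_lod0[i + 1]] for i in range(len(out_lod0) - 1)]
print(seqs)                                   # [[1], [2, 3]]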