Basic Concepts
Character Encoding Layer
The character encoding layer builds a word representation from its characters: the character embeddings pass through several convolution and max-pooling layers, then through several highway layers and a projection layer, and the result is used as the word embedding fed into the subsequent layers.
Assume the input has shape [batch_size, seq_len, w_dim], where batch_size is the number of sentences (the batch dimension), seq_len is the maximum sentence length, and w_dim is the maximum word length in characters (50 by default).
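As a quick orientation, here is a minimal sketch of what such an input might look like; the sizes batch_size=2, seq_len=10, w_dim=50 are arbitrary assumptions for illustration:

import tensorflow as tf

# Hypothetical character-id input: 2 sentences, up to 10 words each,
# every word padded/truncated to 50 character ids
batch_size, seq_len, w_dim = 2, 10, 50
dummy_chars = tf.zeros((batch_size, seq_len, w_dim), dtype=tf.int32)
print(dummy_chars.shape)  # (2, 10, 50)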
Implementation:
# Character encoding layer
# shape: (batch_size, seq_len, w_dim) => (batch_size, seq_len, p_dim)
class CharEncoder(layers.Layer):
    # num_chars: size of the character vocabulary (analogous to a word vocabulary)
    # c_dim: each character is mapped to a c_dim-dimensional vector
    # p_dim: output dimension of the projection layer
    def __init__(self, w_dim, num_chars, c_dim, p_dim):
        super(CharEncoder, self).__init__()
        self.w_dim = w_dim
        # Multi-scale CNN
        # Character embedding layer: maps each character to a vector
        # w_dim: maximum number of characters per word
        self.embedding = layers.Embedding(num_chars, c_dim, trainable=True)
        # Four 1-D convolution layers applied along the character sequence,
        # with kernel_size = [1, 2, 3, 4] and filters = [16, 16, 32, 64]
        filters_list = [16, 16, 32, 64]
        kernel_size_list = [1, 2, 3, 4]
        conv1d_list = zip(filters_list, kernel_size_list)
        self.conv1ds = [layers.Conv1D(filters, kernel_size) for filters, kernel_size in conv1d_list]
        # Four 1-D max-pooling layers, one per convolution output
        # pool_size = w_dim - kernel_size + 1, i.e. [w_dim, w_dim - 1, w_dim - 2, w_dim - 3]
        pool_size_list = [w_dim, w_dim - 1, w_dim - 2, w_dim - 3]
        self.maxpool1ds = [layers.MaxPool1D(pool_size) for pool_size in pool_size_list]
        num_activations = 4
        self.activations = [layers.Activation("tanh") for _ in range(num_activations)]
        # Highway
        # Two Dense layers implement one highway block
        units = sum(filters_list)
        self.dense1 = layers.Dense(units, activation="sigmoid")
        self.dense2 = layers.Dense(units, activation="relu")
        # Projection
        # One Dense layer implements the projection layer
        self.dense3 = layers.Dense(p_dim, activation="relu")

    def __call__(self, x):
        batch_size, seq_len = tf.shape(x)[0], tf.shape(x)[1]
        # Reshape
        # shape: (batch_size, seq_len, w_dim) => (batch_size * seq_len, w_dim)
        x = tf.reshape(x, (-1, self.w_dim))
        # Character embedding
        # shape: (batch_size * seq_len, w_dim) => (batch_size * seq_len, w_dim, c_dim)
        embedding_output = self.embedding(x)
        # Apply each convolution
        # shape_i: (batch_size * seq_len, w_dim, c_dim) => (batch_size * seq_len, w_dim - kernel_size_i + 1, filters_i)
        conv1d_outputs = [conv1d(embedding_output) for conv1d in self.conv1ds]
        # Apply each max pooling
        # shape_i: (batch_size * seq_len, w_dim - kernel_size_i + 1, filters_i) => (batch_size * seq_len, 1, filters_i)
        maxpool1d_outputs = [maxpool1d(conv1d_outputs[i]) for i, maxpool1d in enumerate(self.maxpool1ds)]
        # Activation
        maxpool1d_outputs = [activation(maxpool1d_outputs[i]) for i, activation in enumerate(self.activations)]
        # Squeeze out the pooled dimension
        # shape: (batch_size * seq_len, 1, filters_i) => (batch_size * seq_len, filters_i)
        maxpool1d_outputs = [tf.squeeze(maxpool1d_output, axis=1) for maxpool1d_output in maxpool1d_outputs]
        # Concatenate the pooled features
        # shape: => (batch_size * seq_len, filters_1 + filters_2 + filters_3 + filters_4)
        output = tf.concat(maxpool1d_outputs, axis=-1)
        # Reshape
        # shape: (batch_size * seq_len, sum(filters)) => (batch_size, seq_len, sum(filters))
        output = tf.reshape(output, (batch_size, seq_len, -1))
        # Pass through the highway layers
        # shape: (batch_size, seq_len, sum(filters)) => (batch_size, seq_len, sum(filters))
        num_highways = 2
        for _ in range(num_highways):
            carry_gate = self.dense1(output)
            transform_gate = self.dense2(output)
            output = carry_gate * transform_gate + (1 - carry_gate) * output
        # Projection layer
        # shape: (batch_size, seq_len, sum(filters)) => (batch_size, seq_len, p_dim)
        output = self.dense3(output)
        return output
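A quick smoke test of this layer might look like the following; the sizes are illustrative assumptions, and tf / layers are assumed to be imported as in the full listing later (import tensorflow as tf; from tensorflow.keras import layers):

# Illustrative only: 64 sentences, 40 words each, 50 character ids per word
char_encoder = CharEncoder(w_dim=50, num_chars=256, c_dim=16, p_dim=512)
x = tf.ones((64, 40, 50))
print(char_encoder(x).shape)  # expected: (64, 40, 512)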
BiLMs Layer
ELMo is built on top of BiLMs (bidirectional language models). In this structure, h denotes the dimension of the LSTM cell, written h_dim below. After each layer, a Linear layer maps the dimension from h_dim back to p_dim before the result is fed into the next layer. Finally, the output of every layer is stacked together with the output of the embedding layer, giving a tensor of shape [L+1, batch_size, seq_len, 2*p_dim]; the embedding output is duplicated (concatenated with itself) so that its last dimension matches the 2*p_dim output of each BiLM layer.
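For reference, the bidirectional language model that ELMo is trained on jointly maximizes the forward and backward log-likelihood (this is background from the ELMo paper; the training script later in this post only optimizes a simplified next-word objective):

$$\sum_{k=1}^{N} \Big( \log p(t_k \mid t_1, \dots, t_{k-1}; \Theta_x, \overrightarrow{\Theta}_{LSTM}, \Theta_s) + \log p(t_k \mid t_{k+1}, \dots, t_N; \Theta_x, \overleftarrow{\Theta}_{LSTM}, \Theta_s) \Big)$$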
Implementation, using a two-layer BiLM as an example:
# BiLMs layer
# shape: (batch_size, seq_len, p_dim) => (3, batch_size, seq_len, 2 * p_dim)
class BiLMs(layers.Layer):
    # h_dim: internal dimension of the LSTM
    # p_dim: dimension produced by the projection layer of CharEncoder
    def __init__(self, p_dim, h_dim):
        super(BiLMs, self).__init__()
        # Two unidirectional LSTMs are used instead of one bidirectional LSTM
        self.lstm_fw1 = layers.LSTM(h_dim, return_sequences=True)
        self.lstm_bw1 = layers.LSTM(h_dim, return_sequences=True, go_backwards=True)
        self.dense_fw1 = layers.TimeDistributed(layers.Dense(p_dim))
        self.dense_bw1 = layers.TimeDistributed(layers.Dense(p_dim))
        # Second layer
        self.lstm_fw2 = layers.LSTM(h_dim, return_sequences=True)
        self.lstm_bw2 = layers.LSTM(h_dim, return_sequences=True, go_backwards=True)
        self.dense_fw2 = layers.TimeDistributed(layers.Dense(p_dim))
        self.dense_bw2 = layers.TimeDistributed(layers.Dense(p_dim))

    def __call__(self, x):
        # First LSTM + Dense layer
        lstm_fw1_output = self.lstm_fw1(x)
        # go_backwards=True returns the sequence in reversed time order,
        # so flip it back to align time steps with the forward output
        lstm_bw1_output = tf.reverse(self.lstm_bw1(x), axis=[1])
        # shape: (batch_size, seq_len, h_dim) => (batch_size, seq_len, p_dim)
        dense_fw1_output = self.dense_fw1(lstm_fw1_output)
        dense_bw1_output = self.dense_bw1(lstm_bw1_output)
        # Concatenate the two directions
        # shape: (batch_size, seq_len, p_dim) => (batch_size, seq_len, 2 * p_dim)
        output1 = tf.concat([dense_fw1_output, dense_bw1_output], axis=-1)
        # Second LSTM + Dense layer: each direction continues from its own layer-1 output
        lstm_fw2_output = self.lstm_fw2(dense_fw1_output)
        lstm_bw2_output = tf.reverse(self.lstm_bw2(dense_bw1_output), axis=[1])
        dense_fw2_output = self.dense_fw2(lstm_fw2_output)
        dense_bw2_output = self.dense_bw2(lstm_bw2_output)
        # Concatenate the two directions
        output2 = tf.concat([dense_fw2_output, dense_bw2_output], axis=-1)
        # Duplicate the input along the feature axis so its last dimension
        # matches the 2 * p_dim outputs it will be stacked with
        # shape: (batch_size, seq_len, p_dim) => (batch_size, seq_len, 2 * p_dim)
        x = tf.concat([x, x], axis=-1)
        # shape: (batch_size, seq_len, 2 * p_dim) => (1, batch_size, seq_len, 2 * p_dim)
        x = tf.expand_dims(x, axis=0)
        output1 = tf.expand_dims(output1, axis=0)
        output2 = tf.expand_dims(output2, axis=0)
        # Stack the embedding output and the two layer outputs
        # shape: => (3, batch_size, seq_len, 2 * p_dim)
        output = tf.concat([x, output1, output2], axis=0)
        return output
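As with the character encoder, a quick shape check (the sizes p_dim=16, h_dim=32 are illustrative):

# Illustrative only
bi_lms = BiLMs(p_dim=16, h_dim=32)
x = tf.ones((64, 40, 16))
print(bi_lms(x).shape)  # expected: (3, 64, 40, 32)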
Scalar Mix Layer
After the BiLMs layer, the output has shape [L+1, batch_size, seq_len, 2*p_dim]. For each token $t_k$, an $L$-layer BiLM produces $2L + 1$ representations:

$$R_k = \{x_k^{LM}, \overrightarrow{h}_{k,j}^{LM}, \overleftarrow{h}_{k,j}^{LM} \mid j = 1, \dots, L\} = \{h_{k,j}^{LM} \mid j = 0, \dots, L\}$$

The resulting ELMo vector is

$$ELMo_k^{task} = \gamma^{task} \sum_{j=0}^{L} s_j^{task} h_{k,j}^{LM}$$

Here $x_k^{LM} = h_{k,0}^{LM}$ is the output of the word embedding (character encoding) layer, and $h_{k,j}^{LM} = [\overrightarrow{h}_{k,j}^{LM}, \overleftarrow{h}_{k,j}^{LM}]$ is the concatenation of the forward and backward outputs of layer $j$. With the two-layer BiLM above, $L = 2$, so each token gets $2L + 1 = 5$ representations, stored as the three stacked tensors of shape (batch_size, seq_len, 2 * p_dim).
The scalar mixer implements exactly this weighted combination $\gamma^{task} \sum_{j=0}^{L} s_j^{task} h_{k,j}^{LM}$. Here $s_j^{task}$ are softmax-normalized weights, and $\gamma^{task}$ rescales the whole ELMo vector; its value depends on the downstream task, and different tasks should use different values.
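As a purely illustrative example of how the weighting acts (all numbers made up), with $L = 2$ the mixer combines three stacked representations:

import tensorflow as tf

# Hypothetical per-layer scores for L = 2 (three representations)
scores = tf.constant([0.1, 0.3, 0.6])
s = tf.nn.softmax(scores)            # ≈ [0.26, 0.32, 0.43]
gamma = 0.8
h = tf.ones((3, 4))                  # stand-in for the three stacked representations
elmo_vector = gamma * tf.tensordot(s, h, axes=1)
print(s.numpy(), elmo_vector.shape)  # shape (4,)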
Implementation:
# Scalar mix layer
# shape: (3, batch_size, seq_len, 2 * p_dim) => (batch_size, seq_len, 2 * p_dim)
class ScalarMixer(layers.Layer):
    def __init__(self, gamma):
        super(ScalarMixer, self).__init__()
        self.gamma = gamma
        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.ln3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def __call__(self, x):
        # Note: instead of learning a single scalar weight per layer, this version
        # derives element-wise weights from a softmax + LayerNormalization of each
        # layer's activations, then scales the weighted sum by gamma
        output1, output2, output3 = x[0], x[1], x[2]
        softmax_output1 = tf.nn.softmax(output1)
        ln1_output = self.ln1(softmax_output1)
        softmax_output2 = tf.nn.softmax(output2)
        ln2_output = self.ln2(softmax_output2)
        softmax_output3 = tf.nn.softmax(output3)
        ln3_output = self.ln3(softmax_output3)
        output = self.gamma * (ln1_output * output1 + ln2_output * output2 + ln3_output * output3)
        return output
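If you want a mixer that follows the paper's formula more literally, here is a minimal sketch of my own (not part of the original code, PaperScalarMixer is a hypothetical name): it learns one scalar weight per stacked representation plus a trainable gamma.

import tensorflow as tf
from tensorflow.keras import layers

class PaperScalarMixer(layers.Layer):
    # Sketch only: one trainable scalar per stacked representation plus gamma,
    # softmax-normalized as in the ELMo formula
    def __init__(self, num_layers=3):
        super(PaperScalarMixer, self).__init__()
        self.s = self.add_weight(name="s", shape=(num_layers,), initializer="zeros", trainable=True)
        self.gamma = self.add_weight(name="gamma", shape=(), initializer="ones", trainable=True)

    def call(self, x):
        # x: (num_layers, batch_size, seq_len, 2 * p_dim)
        weights = tf.nn.softmax(self.s)                        # (num_layers,)
        weighted = tf.tensordot(weights, x, axes=[[0], [0]])   # (batch_size, seq_len, 2 * p_dim)
        return self.gamma * weighted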
ELMo Model
Implementation:
# ELMo model
class ELMo(tf.keras.Model):
    # vocab_size: size of the word vocabulary
    # w_dim: maximum word length in characters, 50 by default
    # num_chars: size of the character vocabulary, typically 256
    # c_dim: dimension of the character embedding, typically 16
    def __init__(self, vocab_size, w_dim, num_chars, c_dim, p_dim, h_dim, gamma=0.8):
        super(ELMo, self).__init__()
        # Frozen word embedding, used here as a stand-in for per-word character ids
        self.embedding = layers.Embedding(vocab_size, w_dim, trainable=False)
        self.char_encoder = CharEncoder(w_dim, num_chars, c_dim, p_dim)
        self.bi_lstm = BiLMs(p_dim, h_dim)
        self.scalar_mixer = ScalarMixer(gamma)
        self.dense = layers.TimeDistributed(layers.Dense(vocab_size, activation="softmax"))

    def __call__(self, x):
        # Word embedding layer
        # shape: (batch_size, seq_len) => (batch_size, seq_len, w_dim)
        embedding_output = self.embedding(x)
        # Character encoding layer
        # shape: (batch_size, seq_len, w_dim) => (batch_size, seq_len, p_dim)
        char_encoder_output = self.char_encoder(embedding_output)
        # BiLMs layer
        # shape: (batch_size, seq_len, p_dim) => (3, batch_size, seq_len, 2 * p_dim)
        bi_lstm_output = self.bi_lstm(char_encoder_output)
        # Scalar mix layer
        # shape: (3, batch_size, seq_len, 2 * p_dim) => (batch_size, seq_len, 2 * p_dim)
        scalar_mixer_output = self.scalar_mixer(bi_lstm_output)
        # Output layer
        # shape: (batch_size, seq_len, 2 * p_dim) => (batch_size, seq_len, vocab_size)
        output = self.dense(scalar_mixer_output)
        return output
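A tiny end-to-end shape check, with illustrative sizes and assuming the classes above are in scope:

# Illustrative only: small dimensions to keep the test fast
elmo = ELMo(vocab_size=2000, w_dim=50, num_chars=256, c_dim=16, p_dim=32, h_dim=64)
tokens = tf.ones((4, 40), dtype=tf.int32)
print(elmo(tokens).shape)  # expected: (4, 40, 2000)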
Test Code
Below is an example that automatically writes English text. elmo.py:
import tensorflow as tf
from tensorflow.keras import layers


# Character encoding layer
# shape: (batch_size, seq_len, w_dim) => (batch_size, seq_len, p_dim)
class CharEncoder(layers.Layer):
    # num_chars: size of the character vocabulary (analogous to a word vocabulary)
    # c_dim: each character is mapped to a c_dim-dimensional vector
    # p_dim: output dimension of the projection layer
    def __init__(self, w_dim, num_chars, c_dim, p_dim):
        super(CharEncoder, self).__init__()
        # Multi-scale CNN
        # Character embedding layer: maps each character to a vector
        # w_dim: maximum number of characters per word
        # shape: (batch_size, seq_len, w_dim) => (batch_size, seq_len, w_dim, c_dim)
        self.embedding = layers.Embedding(num_chars, c_dim, trainable=True)
        # Seven convolution layers applied along the character dimension,
        # kernel_size = [1, 2, 3, 4, 5, 6, 7], filters = [16, 16, 32, 64, 128, 256, 512]
        # shape_i: (batch_size, seq_len, w_dim, c_dim) => (batch_size, seq_len, w_dim - kernel_size_i + 1, filters_i)
        filters_list = [16, 16, 32, 64, 128, 256, 512]
        kernel_size_list = [1, 2, 3, 4, 5, 6, 7]
        conv2d_list = zip(filters_list, kernel_size_list)
        self.conv2ds = [layers.Conv2D(filters, (1, kernel_size)) for filters, kernel_size in conv2d_list]
        # Seven max-pooling layers, one per convolution output
        # pool_size = w_dim - kernel_size + 1, i.e. [w_dim, w_dim - 1, ..., w_dim - 6]
        # shape_i: (batch_size, seq_len, w_dim - kernel_size_i + 1, filters_i) => (batch_size, seq_len, 1, filters_i)
        pool_size_list = [w_dim, w_dim - 1, w_dim - 2, w_dim - 3, w_dim - 4, w_dim - 5, w_dim - 6]
        self.maxpool2ds = [layers.MaxPool2D((1, pool_size)) for pool_size in pool_size_list]
        num_activations = 7
        self.activations = [layers.Activation("relu") for _ in range(num_activations)]
        # Highway
        # Two Dense layers implement one highway block
        # shape: (batch_size, seq_len, sum(filters)) => (batch_size, seq_len, sum(filters))
        units = sum(filters_list)
        self.dense1 = layers.TimeDistributed(layers.Dense(units, activation="sigmoid"))
        self.dense2 = layers.TimeDistributed(layers.Dense(units, activation="relu"))
        # Projection
        # shape: (batch_size, seq_len, sum(filters)) => (batch_size, seq_len, p_dim)
        # One Dense layer implements the projection layer
        self.dense3 = layers.TimeDistributed(layers.Dense(p_dim, activation="relu"))

    def __call__(self, x):
        # Character embedding
        # shape: (batch_size, seq_len, w_dim) => (batch_size, seq_len, w_dim, c_dim)
        embedding_output = self.embedding(x)
        # Apply each convolution
        # shape_i: (batch_size, seq_len, w_dim, c_dim) => (batch_size, seq_len, w_dim - kernel_size_i + 1, filters_i)
        conv2d_outputs = [conv2d(embedding_output) for conv2d in self.conv2ds]
        # Apply each max pooling
        # shape_i: (batch_size, seq_len, w_dim - kernel_size_i + 1, filters_i) => (batch_size, seq_len, 1, filters_i)
        maxpool2d_outputs = [maxpool2d(conv2d_outputs[i]) for i, maxpool2d in enumerate(self.maxpool2ds)]
        # Activation
        maxpool2d_outputs = [activation(maxpool2d_outputs[i]) for i, activation in enumerate(self.activations)]
        # Squeeze out the pooled dimension
        # shape: (batch_size, seq_len, 1, filters_i) => (batch_size, seq_len, filters_i)
        maxpool2d_outputs = [tf.squeeze(maxpool2d_output, axis=2) for maxpool2d_output in maxpool2d_outputs]
        # Concatenate the pooled features
        # shape: => (batch_size, seq_len, filters_1 + filters_2 + ... + filters_7)
        output = tf.concat(maxpool2d_outputs, axis=-1)
        # Pass through the highway layers
        # shape: (batch_size, seq_len, sum(filters)) => (batch_size, seq_len, sum(filters))
        num_highways = 2
        for _ in range(num_highways):
            carry_gate = self.dense1(output)
            transform_gate = self.dense2(output)
            output = carry_gate * transform_gate + (1 - carry_gate) * output
        # Projection layer
        # shape: (batch_size, seq_len, sum(filters)) => (batch_size, seq_len, p_dim)
        output = self.dense3(output)
        return output


# # Test code
# char_encoder = CharEncoder(50, 256, 16, 512)
# x = tf.ones((64, 40, 50))
# char_encoder(x)


# BiLMs layer
# shape: (batch_size, seq_len, p_dim) => (3, batch_size, seq_len, 2 * p_dim)
class BiLMs(layers.Layer):
    # h_dim: internal dimension of the LSTM
    # p_dim: dimension produced by the projection layer of CharEncoder
    def __init__(self, p_dim, h_dim):
        super(BiLMs, self).__init__()
        # Two unidirectional LSTMs are used instead of one bidirectional LSTM
        self.lstm_fw1 = layers.LSTM(h_dim, return_sequences=True)
        self.lstm_bw1 = layers.LSTM(h_dim, return_sequences=True, go_backwards=True)
        self.dense_fw1 = layers.TimeDistributed(layers.Dense(p_dim))
        self.dense_bw1 = layers.TimeDistributed(layers.Dense(p_dim))
        # Second layer
        self.lstm_fw2 = layers.LSTM(h_dim, return_sequences=True)
        self.lstm_bw2 = layers.LSTM(h_dim, return_sequences=True, go_backwards=True)
        self.dense_fw2 = layers.TimeDistributed(layers.Dense(p_dim))
        self.dense_bw2 = layers.TimeDistributed(layers.Dense(p_dim))

    def __call__(self, x):
        # First LSTM + Dense layer
        lstm_fw1_output = self.lstm_fw1(x)
        # go_backwards=True returns the sequence in reversed time order,
        # so flip it back to align time steps with the forward output
        lstm_bw1_output = tf.reverse(self.lstm_bw1(x), axis=[1])
        # shape: (batch_size, seq_len, h_dim) => (batch_size, seq_len, p_dim)
        dense_fw1_output = self.dense_fw1(lstm_fw1_output)
        dense_bw1_output = self.dense_bw1(lstm_bw1_output)
        # Concatenate the two directions
        # shape: (batch_size, seq_len, p_dim) => (batch_size, seq_len, 2 * p_dim)
        output1 = tf.concat([dense_fw1_output, dense_bw1_output], axis=-1)
        # Second LSTM + Dense layer: each direction continues from its own layer-1 output
        lstm_fw2_output = self.lstm_fw2(dense_fw1_output)
        lstm_bw2_output = tf.reverse(self.lstm_bw2(dense_bw1_output), axis=[1])
        dense_fw2_output = self.dense_fw2(lstm_fw2_output)
        dense_bw2_output = self.dense_bw2(lstm_bw2_output)
        # Concatenate the two directions
        output2 = tf.concat([dense_fw2_output, dense_bw2_output], axis=-1)
        # Duplicate the input along the feature axis so its last dimension
        # matches the 2 * p_dim outputs it will be stacked with
        # shape: (batch_size, seq_len, p_dim) => (batch_size, seq_len, 2 * p_dim)
        x = tf.concat([x, x], axis=-1)
        # shape: (batch_size, seq_len, 2 * p_dim) => (1, batch_size, seq_len, 2 * p_dim)
        x = tf.expand_dims(x, axis=0)
        output1 = tf.expand_dims(output1, axis=0)
        output2 = tf.expand_dims(output2, axis=0)
        # Stack the embedding output and the two layer outputs
        # shape: => (3, batch_size, seq_len, 2 * p_dim)
        output = tf.concat([x, output1, output2], axis=0)
        return output


# # Test code
# bi_lstm = BiLMs(16, 32)
# x = tf.ones((64, 40, 16))
# output = bi_lstm(x)


# Scalar mix layer
# shape: (3, batch_size, seq_len, 2 * p_dim) => (batch_size, seq_len, 2 * p_dim)
class ScalarMixer(layers.Layer):
    def __init__(self, gamma):
        super(ScalarMixer, self).__init__()
        self.gamma = gamma
        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.ln3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    def __call__(self, x):
        # Element-wise weights derived from softmax + LayerNormalization of each layer,
        # then the weighted sum is scaled by gamma
        output1, output2, output3 = x[0], x[1], x[2]
        softmax_output1 = tf.nn.softmax(output1)
        ln1_output = self.ln1(softmax_output1)
        softmax_output2 = tf.nn.softmax(output2)
        ln2_output = self.ln2(softmax_output2)
        softmax_output3 = tf.nn.softmax(output3)
        ln3_output = self.ln3(softmax_output3)
        output = self.gamma * (ln1_output * output1 + ln2_output * output2 + ln3_output * output3)
        return output


# ELMo model
class ELMo(tf.keras.Model):
    # vocab_size: size of the word vocabulary
    # w_dim: maximum word length in characters, 50 by default
    # num_chars: size of the character vocabulary, typically 256
    # c_dim: dimension of the character embedding, typically 16
    def __init__(self, vocab_size, w_dim, num_chars, c_dim, p_dim, h_dim, gamma=0.8):
        super(ELMo, self).__init__()
        # Frozen word embedding, used here as a stand-in for per-word character ids
        self.embedding = layers.Embedding(vocab_size, w_dim, trainable=False)
        self.char_encoder = CharEncoder(w_dim, num_chars, c_dim, p_dim)
        self.bi_lstm = BiLMs(p_dim, h_dim)
        self.scalar_mixer = ScalarMixer(gamma)
        self.dense = layers.TimeDistributed(layers.Dense(vocab_size, activation="softmax"))

    def __call__(self, x):
        # Word embedding layer
        # shape: (batch_size, seq_len) => (batch_size, seq_len, w_dim)
        embedding_output = self.embedding(x)
        # Character encoding layer
        # shape: (batch_size, seq_len, w_dim) => (batch_size, seq_len, p_dim)
        char_encoder_output = self.char_encoder(embedding_output)
        # BiLMs layer
        # shape: (batch_size, seq_len, p_dim) => (3, batch_size, seq_len, 2 * p_dim)
        bi_lstm_output = self.bi_lstm(char_encoder_output)
        # Scalar mix layer
        # shape: (3, batch_size, seq_len, 2 * p_dim) => (batch_size, seq_len, 2 * p_dim)
        scalar_mixer_output = self.scalar_mixer(bi_lstm_output)
        # Output layer
        # shape: (batch_size, seq_len, 2 * p_dim) => (batch_size, seq_len, vocab_size)
        output = self.dense(scalar_mixer_output)
        return output


# # Test code
# elmo = ELMo(2000, 40, 262, 40, 2, 16, 32)
# a = tf.ones((64, 40))
# output = elmo(a)
train_elmo.py
import time
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow.keras import layers, losses, optimizers, metrics, datasets, preprocessing
from ELMo.elmo import ELMo

# Build the dataset
# Load the dataset dictionary (train / validation / test splits) together with its metadata
examples, metadata = tfds.load('ted_hrlr_translate/pt_to_en', with_info=True, as_supervised=True)
# Get the train, validation and test splits
train_examples, val_examples, test_examples = examples["train"], examples["validation"], examples["test"]
# Build the subword tokenizers (the vocabulary is built from the validation split here)
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus((en.numpy() for pt, en in val_examples),
                                                                         target_vocab_size=2 ** 13)
tokenizer_pt = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus((pt.numpy() for pt, en in val_examples),
                                                                         target_vocab_size=2 ** 13)

# # Test code
# sample_str = 'hello world, tensorflow 2'
# tokenized_str = tokenizer_en.encode(sample_str)
# print(tokenized_str)
# original_str = tokenizer_en.decode(tokenized_str)
# print(original_str)


# Dataset preprocessing
# Add start and end markers, encoded as vocab_size and vocab_size + 1
# Input: pt and en sentences; output: id sequences with start/end markers, as tensors
def encode(pt, en):
    pt = [tokenizer_pt.vocab_size] + tokenizer_pt.encode(pt.numpy()) + [tokenizer_pt.vocab_size + 1]
    en = [tokenizer_en.vocab_size] + tokenizer_en.encode(en.numpy()) + [tokenizer_en.vocab_size + 1]
    return pt, en


# Wrap encode with py_function so the result comes back as tensors in the tf.data pipeline
def tf_encode(pt, en):
    return tf.py_function(encode, [pt, en], [tf.int32, tf.int32])


# Filter out sentences longer than 40 tokens
def filter_sentence(x, y, max_length=40):
    return tf.logical_and(tf.size(x) <= max_length, tf.size(y) <= max_length)


# Training set
train_dataset = train_examples.map(tf_encode)
train_dataset = train_dataset.filter(filter_sentence)
# Cache for faster repeated reads
train_dataset = train_dataset.cache()
# Pad sequences shorter than 40 tokens; each batch has shape (64, 40)
train_dataset = train_dataset.padded_batch(64, padded_shapes=([40], [40]))
train_dataset = train_dataset.prefetch(tf.data.experimental.AUTOTUNE)
# Validation set
validate_dataset = val_examples.map(tf_encode)
validate_dataset = validate_dataset.filter(filter_sentence)
validate_dataset = validate_dataset.padded_batch(64, padded_shapes=([40], [40]))

# Build the model
# elmo = tf.keras.Sequential([
#     layers.Embedding(tokenizer_en.vocab_size + 2, 128),
#     layers.LSTM(256, return_sequences=True),
#     layers.LSTM(256, return_sequences=True),
#     layers.Dense(tokenizer_en.vocab_size + 2, activation="softmax")
# ])
elmo = ELMo(vocab_size=tokenizer_en.vocab_size + 2,
            w_dim=50,
            num_chars=256,
            c_dim=16,
            p_dim=512,
            h_dim=512)
# Loss function
loss_func = losses.SparseCategoricalCrossentropy()
# Optimizer
optimizer = optimizers.Adam(0.004)
# Metrics
# Loss
train_loss = metrics.Mean(name='train_loss')
# Accuracy
train_accuracy = metrics.SparseCategoricalAccuracy(name="train_accuracy")
# Checkpoints
checkpoint_path = "./checkpoint"
checkpoint = tf.train.Checkpoint(elmo=elmo, optimizer=optimizer)
checkpoint_manager = tf.train.CheckpointManager(checkpoint, checkpoint_path, 3)
if checkpoint_manager.latest_checkpoint:
    checkpoint.restore(checkpoint_manager.latest_checkpoint)
    print("restore")


# Training step
@tf.function
def train_step(inputs):
    # Language-model targets: predict token k + 1 from tokens 1..k
    inputs_real = inputs[:, 1:]
    inputs = inputs[:, :-1]
    with tf.GradientTape() as tape:
        prediction = elmo(inputs)
        loss = loss_func(inputs_real, prediction)
    # Compute gradients
    gradients = tape.gradient(loss, elmo.trainable_variables)
    # Back-propagate
    optimizer.apply_gradients(zip(gradients, elmo.trainable_variables))
    # Update metrics
    train_loss(loss)
    train_accuracy(inputs_real, prediction)


# Training loop
def train(epochs=100):
    for epoch in range(epochs):
        start = time.time()
        train_loss.reset_state()
        train_accuracy.reset_state()
        # Only the English sentences are used, since this is an English language model
        # (the smaller validation split is iterated here)
        for batch, (_, inputs) in enumerate(validate_dataset):
            train_step(inputs)
        print("epoch:{}, loss:{:.6f}, accuracy:{:.6f}".format(
            epoch + 1, train_loss.result(), train_accuracy.result()
        ))
        end = time.time()
        print("time in 1 epoch:{:.6f} secs".format(end - start))
        if epoch % 10 == 0:
            checkpoint_save_path = checkpoint_manager.save()
            print('epoch {}, save model at {}'.format(
                epoch + 1, checkpoint_save_path
            ))


def evaluate():
    # Start from the start-of-sentence marker
    inputs = tf.expand_dims([tokenizer_en.vocab_size], axis=0)  # (1, 1)
    # Generate up to 100 tokens, one per iteration
    for i in range(100):
        predictions = elmo(inputs)
        # Take the prediction for the last position
        predictions = predictions[:, -1, :]
        # Greedily pick the id with the highest probability
        predictions_id = tf.cast(tf.argmax(predictions, axis=-1), dtype=tf.int32)
        # Stop once the end-of-sentence marker is produced
        if tf.equal(predictions_id, tokenizer_en.vocab_size + 1):
            inputs = tf.concat([inputs, [predictions_id]], axis=-1)
            return tf.squeeze(inputs, axis=0)
        # Append the new id and feed the extended sequence back in
        inputs = tf.concat([inputs, [predictions_id]], axis=-1)
    return tf.squeeze(inputs, axis=0)


train()
output = evaluate()
output = tokenizer_en.decode(output[1: -1])
print(output)