Implementing Machine Translation with GRU + Attention
Reference: https://work.datafountain.cn/forum?id=158&type=2&source=1
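This section builds the data pipeline: the hyperparameters and a batched tf.data.Dataset over the training pairs, followed by the tokenization and padding steps that produce those tensors.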
import tensorflow as tf

# Hyperparameters for the tf.data pipeline and the model.
# input_tensor_train / target_tensor_train are the train split of the padded tensors,
# and input_dict_reverse / target_dict_reverse are the id-to-token lookup dictionaries
# (see the sketch after the tokenization code below).
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 64
steps_per_epoch = len(input_tensor_train) // BATCH_SIZE
embedding_size = 256
units = 1024
vocab_input_size = len(input_dict_reverse) + 1   # +1 for the padding index 0
vocab_target_size = len(target_dict_reverse) + 1

# Shuffle the training pairs and batch them; drop_remainder=True keeps every batch at exactly BATCH_SIZE.
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

# Peek at one batch to confirm the shapes.
example_input_batch, example_target_batch = next(iter(dataset))
print(example_input_batch.shape, example_target_batch.shape)
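Because drop_remainder=True discards the final incomplete batch, each batch is a pair of integer tensors of shape (BATCH_SIZE, padded sentence length): (64, MAX_INPUT_LENGTH) for the inputs and (64, MAX_OUTPUT_LENGTH) for the targets.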
# Initialize the Tokenizer. The sentences were already preprocessed earlier,
# so here we simply split each token on whitespace (filters='' keeps punctuation tokens).
targ_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
# Fit the tokenizer on the target-language corpus
targ_lang_tokenizer.fit_on_texts(targ_lang)
# Convert each target sentence into a sequence of integer ids
tensor = targ_lang_tokenizer.texts_to_sequences(targ_lang)
# Pad all target sentences to the same length
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

# Repeat the same steps for the input (source) language
inp_lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
inp_lang_tokenizer.fit_on_texts(inp_lang)
tensor = inp_lang_tokenizer.texts_to_sequences(inp_lang)
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')

# After post-padding, every row already has the maximum length
MAX_INPUT_LENGTH = input_tensor.shape[1]
MAX_OUTPUT_LENGTH = target_tensor.shape[1]
print("Maximum input sentence length:", MAX_INPUT_LENGTH)
print("Maximum output sentence length:", MAX_OUTPUT_LENGTH)
print(input_tensor[0])
print(target_tensor[0])
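The dataset pipeline at the top refers to input_tensor_train, target_tensor_train, input_dict_reverse and target_dict_reverse, which are produced between these two steps. A minimal sketch of that glue code, assuming sklearn's train_test_split and using the tokenizers' index_word mapping as the reverse dictionaries (the split ratio and exact variable handling in the original may differ):

from sklearn.model_selection import train_test_split

# Split the padded tensors into training and validation sets (the 80/20 ratio is an assumption)
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(
    input_tensor, target_tensor, test_size=0.2)

# Reverse lookup dictionaries: integer id -> token, one per language.
# The Keras Tokenizer already exposes this mapping as index_word.
input_dict_reverse = inp_lang_tokenizer.index_word
target_dict_reverse = targ_lang_tokenizer.index_word

# Sanity check: decode the first training example back to text (id 0 is padding)
print(' '.join(input_dict_reverse[t] for t in input_tensor_train[0] if t != 0))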