Chapter 5: Recurrent Neural Network Language Model (pretraining static word vectors with an RNN)
Data
'''
@Filename :5-2rnnlm.py
@Description :
@Datatime :2021/08/26 11:14:21
@Author :qtxu
@Version :v1.0
'''
from tqdm.auto import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, TensorDataset
from torch.nn.utils.rnn import pad_sequence

from vocab import Vocab
from utils import (BOS_TOKEN, EOS_TOKEN, PAD_TOKEN, BOW_TOKEN, EOW_TOKEN,
                   load_reuters, load_pretrained, save_pretrained,
                   get_loader, init_weights)
class RnnlmDataset(Dataset):
    def __init__(self, corpus, vocab):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        self.pad = vocab[PAD_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # Input:  <bos> w1 w2 ... wn
            # Target: w1 w2 ... wn <eos>  (shifted by one position)
            inputs = [self.bos] + sentence
            targets = sentence + [self.eos]
            self.data.append((inputs, targets))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # Pad all inputs and targets in a batch to the same length with the <pad> id
        inputs = [torch.tensor(ex[0]) for ex in examples]
        targets = [torch.tensor(ex[1]) for ex in examples]
        inputs = pad_sequence(inputs, batch_first=True, padding_value=self.pad)
        targets = pad_sequence(targets, batch_first=True, padding_value=self.pad)
        return (inputs, targets)
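
To make the padding behaviour concrete, here is a minimal sketch (with toy token ids, not taken from the Reuters corpus) of what collate_fn produces for a batch of two examples of different lengths:

import torch
from torch.nn.utils.rnn import pad_sequence

pad_id = 0   # hypothetical <pad> id
examples = [
    ([1, 5, 7], [5, 7, 2]),              # <bos> w1 w2      ->  w1 w2 <eos>
    ([1, 9, 4, 6, 8], [9, 4, 6, 8, 2]),  # <bos> w1 ... w4  ->  w1 ... w4 <eos>
]
inputs = pad_sequence([torch.tensor(ex[0]) for ex in examples],
                      batch_first=True, padding_value=pad_id)
targets = pad_sequence([torch.tensor(ex[1]) for ex in examples],
                       batch_first=True, padding_value=pad_id)
print(inputs.shape, targets.shape)   # torch.Size([2, 5]) torch.Size([2, 5])
print(inputs[0])                     # tensor([1, 5, 7, 0, 0]) -- padded to the batch maximum
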
class RNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(RNNLM, self).__init__()
        # Word embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # LSTM encoder over the embedded sequence
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        # Project hidden states to vocabulary-sized scores
        self.output = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs)
        hidden, _ = self.rnn(embeds)
        output = self.output(hidden)
        # Log-probabilities over the vocabulary at every position
        log_probs = F.log_softmax(output, dim=2)
        return log_probs
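
A quick shape check (with made-up dimensions, independent of the training script below): the model maps a batch of token ids of shape [batch_size, seq_len] to per-position log-probabilities of shape [batch_size, seq_len, vocab_size].

import torch

toy_model = RNNLM(vocab_size=100, embedding_dim=8, hidden_dim=16)
toy_inputs = torch.randint(0, 100, (2, 5))   # 2 sequences of length 5
log_probs = toy_model(toy_inputs)
print(log_probs.shape)                       # torch.Size([2, 5, 100])
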
embedding_dim = 64
context_size = 2   # not used by the RNN LM
hidden_dim = 128
batch_size = 16
num_epoch = 10

# Load the Reuters corpus, build the dataset and the batching DataLoader
corpus, vocab = load_reuters()
dataset = RnnlmDataset(corpus, vocab)
dataloader = get_loader(dataset, batch_size)

# Negative log-likelihood loss; padded positions are ignored
nll_loss = nn.NLLLoss(ignore_index=dataset.pad)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
model = RNNLM(len(vocab), embedding_dim, hidden_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(dataloader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs.view(-1, log_probs.shape[-1]), targets.view(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"loss: {total_loss:.2f}")

# Save the input embedding matrix as pretrained static word vectors
save_pretrained(vocab, model.embeddings.weight.data, "5-2rnnlm.vec")
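
Once training has finished, the saved vectors can be read back with load_pretrained from utils.py (shown further below). The following is only a sketch of how the vectors might be inspected; the query word "bank" is an arbitrary example and falls back to <unk> if it is not in the vocabulary.

import torch
from utils import load_pretrained

vocab, embeds = load_pretrained("5-2rnnlm.vec")
query = embeds[vocab["bank"]]
# Cosine similarity between the query vector and every embedding
sims = torch.cosine_similarity(query.unsqueeze(0), embeds, dim=1)
print(vocab.convert_ids_to_tokens(sims.topk(6).indices.tolist()))
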
vocab
'''
@Filename :vocab.py
@Description :
@Datatime :2021/08/24 17:02:51
@Author :qtxu
@Version :v1.0
'''
from collections import defaultdict,Counter
class Vocab:
    def __init__(self, tokens=None):
        self.idx_to_token = list()
        self.token_to_idx = dict()
        if tokens is not None:
            if "<unk>" not in tokens:
                tokens = tokens + ["<unk>"]
            for token in tokens:
                self.idx_to_token.append(token)
                self.token_to_idx[token] = len(self.idx_to_token) - 1
            self.unk = self.token_to_idx['<unk>']

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        # Count token frequencies over the tokenised text
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1
        # <unk> and reserved tokens come first, then tokens at or above the frequency threshold
        uniq_tokens = ["<unk>"] + (reserved_tokens if reserved_tokens else [])
        uniq_tokens += [token for token, freq in token_freqs.items()
                        if freq >= min_freq and token != "<unk>"]
        return cls(uniq_tokens)

    def __len__(self):
        return len(self.idx_to_token)

    def __getitem__(self, token):
        # Unknown tokens map to the <unk> id
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_to_ids(self, tokens):
        return [self[token] for token in tokens]

    def convert_ids_to_tokens(self, indices):
        return [self.idx_to_token[index] for index in indices]


def save_vocab(vocab, path):
    with open(path, 'w') as writer:
        writer.write("\n".join(vocab.idx_to_token))


def read_vocab(path):
    with open(path, 'r') as f:
        tokens = f.read().split('\n')
    return Vocab(tokens)
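
A small usage sketch of the Vocab API above, using toy sentences rather than the Reuters corpus:

toy_text = [["the", "cat", "sat"], ["the", "dog", "sat"]]
toy_vocab = Vocab.build(toy_text, reserved_tokens=["<pad>"])
print(len(toy_vocab))                                     # 6: <unk>, <pad> and 4 word types
print(toy_vocab.convert_tokens_to_ids(["the", "bird"]))   # unseen "bird" maps to the <unk> id
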
utils
import torch
from torch.utils.data import DataLoader, Dataset, TensorDataset
from vocab import Vocab
from nltk.corpus import reuters
BOS_TOKEN = "<bos>"
EOS_TOKEN = "<eos>"
PAD_TOKEN = "<pad>"
BOW_TOKEN = "<bow>"
EOW_TOKEN = "<eow>"
WEIGHT_INIT_RANGE = 0.1
def load_reuters():
    # Load the NLTK Reuters corpus as lower-cased, tokenised sentences
    text = reuters.sents()
    text = [[word.lower() for word in sentence] for sentence in text]
    vocab = Vocab.build(text, reserved_tokens=[PAD_TOKEN, BOS_TOKEN, EOS_TOKEN])
    corpus = [vocab.convert_tokens_to_ids(sentence) for sentence in text]
    return corpus, vocab


def save_pretrained(vocab, embeds, save_path):
    # Save embeddings in the word2vec text format: a "count dim" header line,
    # then one "token v1 v2 ... vd" line per word
    with open(save_path, "w") as writer:
        writer.write(f"{embeds.shape[0]} {embeds.shape[1]}\n")
        for idx, token in enumerate(vocab.idx_to_token):
            vec = " ".join(["{:.4f}".format(x) for x in embeds[idx]])
            writer.write(f"{token} {vec}\n")
    print(f"Pretrained embeddings saved to: {save_path}")
def load_pretrained(load_path):
    with open(load_path, "r") as fin:
        # First line: vocabulary size and embedding dimension
        n, d = map(int, fin.readline().split())
        tokens = []
        embeds = []
        for line in fin:
            line = line.rstrip().split(' ')
            token, embed = line[0], list(map(float, line[1:]))
            tokens.append(token)
            embeds.append(embed)
        vocab = Vocab(tokens)
        embeds = torch.tensor(embeds, dtype=torch.float)
    return vocab, embeds
def get_loader(dataset, batch_size, shuffle=True):
    data_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=dataset.collate_fn,
        shuffle=shuffle
    )
    return data_loader


def init_weights(model):
    # Uniformly initialise all non-embedding parameters in [-WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE]
    for name, param in model.named_parameters():
        if "embedding" not in name:
            torch.nn.init.uniform_(
                param, a=-WEIGHT_INIT_RANGE, b=WEIGHT_INIT_RANGE)
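
init_weights is imported by 5-2rnnlm.py above but not called there. The sketch below (with a toy ModuleDict, purely for illustration) shows what applying it does: parameters whose names contain "embedding" keep their default initialisation, while everything else is re-drawn uniformly from [-WEIGHT_INIT_RANGE, WEIGHT_INIT_RANGE].

import torch.nn as nn

toy = nn.ModuleDict({"embedding": nn.Embedding(10, 4), "output": nn.Linear(4, 10)})
init_weights(toy)
print(toy["output"].weight.abs().max().item() <= WEIGHT_INIT_RANGE)   # True
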