First, the overall code skeleton (the download link for the data is at the end).
utils.py
from sklearn.utils import shuffle
import pickle


def read_TREC():
    data = {}

    def read(mode):
        x, y = [], []
        with open("data/TREC/TREC_" + mode + ".txt", "r", encoding="utf-8") as f:
            for line in f:
                if line[-1] == "\n":
                    line = line[:-1]
                # the label is the coarse class before ":", the rest of the line is the question text
                y.append(line.split()[0].split(":")[0])
                x.append(line.split()[1:])

        x, y = shuffle(x, y)

        if mode == "train":
            # hold out the first 10% of the shuffled training data as a dev set
            dev_idx = len(x) // 10
            data["dev_x"], data["dev_y"] = x[:dev_idx], y[:dev_idx]
            data["train_x"], data["train_y"] = x[dev_idx:], y[dev_idx:]
        else:
            data["test_x"], data["test_y"] = x, y

    read("train")
    read("test")

    return data
def read_MR():
    data = {}
    x, y = [], []

    with open("data/MR/rt-polarity.pos", "r", encoding="utf-8") as f:
        for line in f:
            if line[-1] == "\n":
                line = line[:-1]
            x.append(line.split())
            y.append(1)

    with open("data/MR/rt-polarity.neg", "r", encoding="utf-8") as f:
        for line in f:
            if line[-1] == "\n":
                line = line[:-1]
            x.append(line.split())
            y.append(0)

    x, y = shuffle(x, y)

    # 80% train / 10% dev / 10% test split
    dev_idx = len(x) // 10 * 8
    test_idx = len(x) // 10 * 9

    data["train_x"], data["train_y"] = x[:dev_idx], y[:dev_idx]
    data["dev_x"], data["dev_y"] = x[dev_idx:test_idx], y[dev_idx:test_idx]
    data["test_x"], data["test_y"] = x[test_idx:], y[test_idx:]

    return data
def save_model(model, params):
    path = f"saved_models/{params['DATASET']}_{params['MODEL']}_{params['EPOCH']}.pkl"
    pickle.dump(model, open(path, "wb"))
    print(f"A model is saved successfully as {path}!")


def load_model(params):
    path = f"saved_models/{params['DATASET']}_{params['MODEL']}_{params['EPOCH']}.pkl"
    try:
        model = pickle.load(open(path, "rb"))
        print(f"Model in {path} loaded successfully!")
        return model
    except Exception:
        print(f"No available model such as {path}.")
        exit()
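Both readers return the same dictionary layout (train/dev/test sentences under *_x and labels under *_y), which the rest of the code relies on. A quick, purely illustrative sanity check (assuming the data/ directory from the link at the end is in place):

# sketch: inspect the splits produced by read_TREC(); read_MR() yields the same keys
import utils

data = utils.read_TREC()
for split in ("train", "dev", "test"):
    print(split, len(data[split + "_x"]), len(data[split + "_y"]))
print("example:", data["train_x"][0], data["train_y"][0])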
model.py
import torch
import torch.nn as nn
import torch.nn.functional as F


class CNN(nn.Module):
    def __init__(self, **kwargs):
        super(CNN, self).__init__()

        self.MODEL = kwargs["MODEL"]
        self.BATCH_SIZE = kwargs["BATCH_SIZE"]
        self.MAX_SENT_LEN = kwargs["MAX_SENT_LEN"]
        self.WORD_DIM = kwargs["WORD_DIM"]
        self.VOCAB_SIZE = kwargs["VOCAB_SIZE"]
        self.CLASS_SIZE = kwargs["CLASS_SIZE"]
        self.FILTERS = kwargs["FILTERS"]
        self.FILTER_NUM = kwargs["FILTER_NUM"]
        self.DROPOUT_PROB = kwargs["DROPOUT_PROB"]
        self.IN_CHANNEL = 1

        assert len(self.FILTERS) == len(self.FILTER_NUM)

        # index VOCAB_SIZE is reserved for unknown words, VOCAB_SIZE + 1 for padding
        self.embedding = nn.Embedding(self.VOCAB_SIZE + 2, self.WORD_DIM, padding_idx=self.VOCAB_SIZE + 1)
        if self.MODEL == "static" or self.MODEL == "non-static" or self.MODEL == "multichannel":
            self.WV_MATRIX = kwargs["WV_MATRIX"]
            self.embedding.weight.data.copy_(torch.from_numpy(self.WV_MATRIX))
            if self.MODEL == "static":
                self.embedding.weight.requires_grad = False
            elif self.MODEL == "multichannel":
                # second channel: a frozen copy of the pre-trained embeddings
                self.embedding2 = nn.Embedding(self.VOCAB_SIZE + 2, self.WORD_DIM, padding_idx=self.VOCAB_SIZE + 1)
                self.embedding2.weight.data.copy_(torch.from_numpy(self.WV_MATRIX))
                self.embedding2.weight.requires_grad = False
                self.IN_CHANNEL = 2

        for i in range(len(self.FILTERS)):
            conv = nn.Conv1d(self.IN_CHANNEL, self.FILTER_NUM[i], self.WORD_DIM * self.FILTERS[i], stride=self.WORD_DIM)
            setattr(self, f"conv_{i}", conv)

        self.fc = nn.Linear(sum(self.FILTER_NUM), self.CLASS_SIZE)

    def get_conv(self, i):
        return getattr(self, f"conv_{i}")

    def forward(self, inp):
        # flatten each embedded sentence into one long vector per channel
        x = self.embedding(inp).view(-1, 1, self.WORD_DIM * self.MAX_SENT_LEN)
        if self.MODEL == "multichannel":
            x2 = self.embedding2(inp).view(-1, 1, self.WORD_DIM * self.MAX_SENT_LEN)
            x = torch.cat((x, x2), 1)

        # convolution + ReLU + max-over-time pooling for each filter size
        conv_results = [
            F.max_pool1d(F.relu(self.get_conv(i)(x)), self.MAX_SENT_LEN - self.FILTERS[i] + 1)
            .view(-1, self.FILTER_NUM[i])
            for i in range(len(self.FILTERS))]

        x = torch.cat(conv_results, 1)
        x = F.dropout(x, p=self.DROPOUT_PROB, training=self.training)
        x = self.fc(x)

        return x
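A detail worth calling out: instead of the usual Conv2d over a (MAX_SENT_LEN, WORD_DIM) matrix, forward() flattens each embedded sentence into one vector of length WORD_DIM * MAX_SENT_LEN and applies Conv1d with kernel size WORD_DIM * h and stride WORD_DIM, so each step of the filter covers exactly h consecutive words and yields MAX_SENT_LEN - h + 1 outputs. A minimal shape check with made-up toy dimensions (purely illustrative, not the real hyperparameters):

# sketch: verify the output shapes of the flattened-Conv1d trick with toy sizes
import torch
from model import CNN

toy = dict(MODEL="rand", BATCH_SIZE=2, MAX_SENT_LEN=7, WORD_DIM=4,
           VOCAB_SIZE=10, CLASS_SIZE=3, FILTERS=[2, 3], FILTER_NUM=[5, 5],
           DROPOUT_PROB=0.5)
model = CNN(**toy)
inp = torch.randint(0, 10, (2, 7))    # (batch, MAX_SENT_LEN) word indices
out = model(inp)
print(out.shape)                      # torch.Size([2, 3]): one score per class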
run.py
import datetime
import time
start_dt = datetime.datetime.now()
print("start_datetime:", start_dt)
time.sleep(2)
for i in range(10000):
    i += 1
from model import CNN
import utils
from torch.autograd import Variable
import torch
import torch.optim as optim
import torch.nn as nn
from sklearn.utils import shuffle
from gensim.models.keyedvectors import KeyedVectors
import numpy as np
import argparse
import copy
def train(data, params):
    if params["MODEL"] != "rand":
        # build the embedding matrix from pre-trained word2vec vectors
        print("loading word2vec...")
        word_vectors = KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin", binary=True)

        # note: `vocab` / `word_vec` are the gensim 3.x API; with gensim >= 4.0
        # the equivalents are `key_to_index` and `word_vectors[word]`
        wv_matrix = []
        for i in range(len(data["vocab"])):
            word = data["idx_to_word"][i]
            if word in word_vectors.vocab:
                wv_matrix.append(word_vectors.word_vec(word))
            else:
                wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))

        # one extra random vector for unknown words and one zero vector for padding
        wv_matrix.append(np.random.uniform(-0.01, 0.01, 300).astype("float32"))
        wv_matrix.append(np.zeros(300).astype("float32"))
        wv_matrix = np.array(wv_matrix)
        params["WV_MATRIX"] = wv_matrix

    model = CNN(**params)

    # materialize the trainable parameters as a list so they can be reused
    # both by the optimizer and by gradient clipping below
    parameters = list(filter(lambda p: p.requires_grad, model.parameters()))
    optimizer = optim.Adadelta(parameters, params["LEARNING_RATE"])
    criterion = nn.CrossEntropyLoss()

    pre_dev_acc = 0
    max_dev_acc = 0
    max_test_acc = 0
    best_model = copy.deepcopy(model)
    for e in range(params["EPOCH"]):
        data["train_x"], data["train_y"] = shuffle(data["train_x"], data["train_y"])

        for i in range(0, len(data["train_x"]), params["BATCH_SIZE"]):
            batch_range = min(params["BATCH_SIZE"], len(data["train_x"]) - i)

            # pad every sentence to MAX_SENT_LEN with the padding index VOCAB_SIZE + 1
            batch_x = [[data["word_to_idx"][w] for w in sent] +
                       [params["VOCAB_SIZE"] + 1] * (params["MAX_SENT_LEN"] - len(sent))
                       for sent in data["train_x"][i:i + batch_range]]
            batch_y = [data["classes"].index(c) for c in data["train_y"][i:i + batch_range]]

            batch_x = Variable(torch.LongTensor(batch_x))
            batch_y = Variable(torch.LongTensor(batch_y))

            optimizer.zero_grad()
            model.train()
            pred = model(batch_x)
            loss = criterion(pred, batch_y)
            loss.backward()
            nn.utils.clip_grad_norm_(parameters, max_norm=params["NORM_LIMIT"])
            optimizer.step()

        dev_acc = test(data, model, params, mode="dev")
        test_acc = test(data, model, params)
        print("epoch:", e + 1, "/ dev_acc:", dev_acc, "/ test_acc:", test_acc)

        if params["EARLY_STOPPING"] and dev_acc <= pre_dev_acc:
            print("early stopping by dev_acc!")
            break
        else:
            pre_dev_acc = dev_acc

        if dev_acc > max_dev_acc:
            max_dev_acc = dev_acc
            max_test_acc = test_acc
            best_model = copy.deepcopy(model)

    print("max dev acc:", max_dev_acc, "test acc:", max_test_acc)
    return best_model
def test(data, model, params, mode="test"):
    model.eval()

    if mode == "dev":
        x, y = data["dev_x"], data["dev_y"]
    elif mode == "test":
        x, y = data["test_x"], data["test_y"]

    # map unknown words to index VOCAB_SIZE and pad with VOCAB_SIZE + 1
    x = [[data["word_to_idx"][w] if w in data["vocab"] else params["VOCAB_SIZE"] for w in sent] +
         [params["VOCAB_SIZE"] + 1] * (params["MAX_SENT_LEN"] - len(sent))
         for sent in x]
    x = Variable(torch.LongTensor(x))

    y = [data["classes"].index(c) for c in y]
    pred = np.argmax(model(x).cpu().data.numpy(), axis=1)
    acc = sum(1 if p == t else 0 for p, t in zip(pred, y)) / len(pred)

    return acc
def main():
    parser = argparse.ArgumentParser(description="-----[CNN-classifier]-----")
    parser.add_argument("--mode", default="train", help="train: train (with test) a model / test: test saved models")
    parser.add_argument("--model", default="rand", help="available models: rand, static, non-static, multichannel")
    parser.add_argument("--dataset", default="TREC", help="available datasets: MR, TREC")
    parser.add_argument("--save_model", default=True, action='store_true', help="whether saving model or not")
    parser.add_argument("--early_stopping", default=False, action='store_true', help="whether to apply early stopping")
    parser.add_argument("--epoch", default=100, type=int, help="number of max epoch")
    parser.add_argument("--learning_rate", default=1e-3, type=float, help="learning rate")

    options = parser.parse_args()
    data = getattr(utils, f"read_{options.dataset}")()

    data["vocab"] = sorted(list(set([w for sent in data["train_x"] + data["dev_x"] + data["test_x"] for w in sent])))
    data["classes"] = sorted(list(set(data["train_y"])))
    data["word_to_idx"] = {w: i for i, w in enumerate(data["vocab"])}
    data["idx_to_word"] = {i: w for i, w in enumerate(data["vocab"])}

    params = {
        "MODEL": options.model,
        "DATASET": options.dataset,
        "SAVE_MODEL": options.save_model,
        "EARLY_STOPPING": options.early_stopping,
        "EPOCH": options.epoch,
        "LEARNING_RATE": options.learning_rate,
        "MAX_SENT_LEN": max([len(sent) for sent in data["train_x"] + data["dev_x"] + data["test_x"]]),
        "BATCH_SIZE": 50,
        "WORD_DIM": 300,
        "VOCAB_SIZE": len(data["vocab"]),
        "CLASS_SIZE": len(data["classes"]),
        "FILTERS": [3, 4, 5],
        "FILTER_NUM": [100, 100, 100],
        "DROPOUT_PROB": 0.5,
        "NORM_LIMIT": 3,
    }

    print("=" * 20 + "INFORMATION" + "=" * 20)
    print("MODEL:", params["MODEL"])
    print("DATASET:", params["DATASET"])
    print("VOCAB_SIZE:", params["VOCAB_SIZE"])
    print("EPOCH:", params["EPOCH"])
    print("LEARNING_RATE:", params["LEARNING_RATE"])
    print("EARLY_STOPPING:", params["EARLY_STOPPING"])
    print("SAVE_MODEL:", params["SAVE_MODEL"])
    print("=" * 20 + "INFORMATION" + "=" * 20)

    if options.mode == "train":
        print("=" * 20 + "TRAINING STARTED" + "=" * 20)
        model = train(data, params)
        if params["SAVE_MODEL"]:
            utils.save_model(model, params)
        print("=" * 20 + "TRAINING FINISHED" + "=" * 20)
    else:
        model = utils.load_model(params)
        test_acc = test(data, model, params)
        print("test acc:", test_acc)


if __name__ == "__main__":
    main()
end_dt = datetime.datetime.now()
print("end_datetime:", end_dt)
print("time cost:", (end_dt - start_dt).seconds, "s")
Run results
From the run output you can see that training ran for 100 epochs, took about 35 minutes, and reached an accuracy of 77.8%, which is fairly good; given that there are 16 classes in total, I think that is a very respectable result!
data
链接:https://pan.baidu.com/s/1sfoCh7qYsIHVn3ee3lrwlw 提取码:2933