Problem description: classify English books into one of four genres: horror, science fiction, humour and crime.
Training data: an excerpt of text from each English book together with its label, plus the book's id, saved in the working directory.
A sample looks like this:
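(The actual sample isn't reproduced here; as a rough, hypothetical illustration of what traindata.json looks like, with the keys taken from the loading code below and the texts, labels and ids invented:)
{
    "X": ["It was a dark and stormy night, and the old house ...", "The starship's engines hummed as ..."],
    "Y": [0, 1],
    "docid": [1023, 877]
}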
Pre-trained model: "bert-base-uncased". Why this one? Because it is fairly small; Transformers are compute-hungry enough as it is, and renting a GPU out of my own pocket is a real strain (T▽T)
1 Import the packages used
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertConfig
from transformers import Trainer, TrainingArguments
import numpy as np
import random
from sklearn.model_selection import train_test_split
import json
from sklearn.metrics import f1_score
from functools import reduce
torch.cuda.current_device()      # touch the device once to force CUDA initialization
torch.cuda._initialized = True   # commonly used workaround for CUDA lazy-initialization errors on some setups
2 Read the data. This time there is a separate test set, so the training data only needs to be split into a training set and a validation set
# Read the training data
train_data = json.load(open("traindata.json", "r")) # loaded as a dictionary
X = train_data['X'] # text excerpts from the books
Y = train_data['Y'] # class labels, 4 classes mapped to 0, 1, 2, 3
docid = train_data['docid'] # book ids; note that a book can appear more than once in the training data
# Read the test data
test_data = json.load(open("testdata.json", "r"))
Xt = test_data['X']
# Randomly split the training data into training and validation sets (80:20), stratified by label
(X_train, X_val, Y_train, Y_val) = train_test_split(X, Y, test_size = 0.2, stratify = Y)
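Not in the original code, but a quick way to sanity-check that the stratified split kept the label proportions (it only needs collections.Counter on top of what's already imported):
from collections import Counter

# Label counts before and after the split; per-class proportions should stay roughly 80:20
print(Counter(Y))
print(Counter(Y_train))
print(Counter(Y_val))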
# Name of the pre-trained model to use
model_name = "bert-base-uncased"
# Set a maximum text length
max_length = 512
# Load the tokenizer of the pre-trained model
tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case = True)
# Encode the texts with the tokenizer; sequences longer than max_length are truncated, shorter ones are padded with the pad token (id 0)
train_encodings = tokenizer(X_train, truncation = True, padding = True, max_length = max_length)
valid_encodings = tokenizer(X_val, truncation = True, padding = True, max_length = max_length)
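If you want to see what the encoding actually contains, you can peek at the first example (a small check I'm adding here, not in the original post):
# Inspect the first encoded training example
print(train_encodings.keys())                # input_ids, token_type_ids, attention_mask
print(len(train_encodings['input_ids'][0]))  # padded sequence length (at most max_length)
print(train_encodings['input_ids'][0][:10])  # first ten token ids; BERT sequences start with [CLS] (id 101)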
# Use the torch.utils.data.Dataset class to build datasets
class generate_dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __getitem__(self, index):
        item = {key: torch.tensor(value[index]) for key, value in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[index])  # scalar label tensor per example
        return item
    def __len__(self):
        return len(self.labels)
# Build the training and validation datasets
train_dataset = generate_dataset(train_encodings, Y_train)
valid_dataset = generate_dataset(valid_encodings, Y_val)
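Again just a sanity check on top of the original code: each item of the dataset is a dict of tensors that the Trainer can batch directly:
sample = train_dataset[0]
print({key: value.shape for key, value in sample.items()})
# input_ids / token_type_ids / attention_mask are 1-D tensors of the padded length; labels is a scalar tensor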
3 The evaluation metric this time is macro F1, so define a function that computes it for use with the transformers Trainer
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    macro_f1 = f1_score(labels, preds, average='macro')  # computed with sklearn.metrics.f1_score
    return {'macro f1': macro_f1}
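As a tiny illustration (toy labels, not from the data) of what average='macro' does: it computes F1 per class and takes the unweighted mean, so small classes count just as much as large ones:
toy_true = [0, 0, 1, 2, 3, 3]
toy_pred = [0, 1, 1, 2, 3, 0]
print(f1_score(toy_true, toy_pred, average='macro'))  # unweighted mean of the four per-class F1 scores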
4 Hyperparameter search with a hand-written random search function
# Random search needs a search space as input, so define it first
hyparams_grid = {
'num_train_epochs': [5, 8, 10, 12],
'learning_rate': [5e-6, 5e-5, 5e-4],
'warmup_steps': [500, 1000],
'weight_decay': [0.001, 0.01],
'hidden_dropout_prob': [0.1, 0.2, 0.3, 0.4, 0.5],
'attention_probs_dropout_prob': [0.1, 0.2, 0.3]
}
# Helper that counts how many distinct hyperparameter combinations the search space contains;
# the maximum number of search rounds must not exceed this, otherwise rounds would have to repeat
def total_group_number(dictionary):
    numbers_of_values = [len(value) for value in dictionary.values()]
    group_number = reduce(lambda x, y: x * y, numbers_of_values)
    return group_number
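For the grid defined above this gives 4 × 3 × 2 × 2 × 5 × 3 = 720 possible combinations:
print(total_group_number(hyparams_grid))  # 4 * 3 * 2 * 2 * 5 * 3 = 720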
# The random search function takes the hyperparameter search space, a maximum number of search rounds and a target macro F1 value
def random_search(hyparams_grid, max_searching_round, val_target):
    """Given a search grid of hyperparameters, a maximum number of search rounds and a target validation score, find the best-performing hyperparameter group within the grid.
    Args:
        hyparams_grid: a dictionary of hyperparameters.
        max_searching_round: a single int, must not exceed the number of possible hyperparameter groups.
        val_target: a single number.
    Returns:
        searching_results: a dictionary recording the results of all search rounds.
        best_val_result: a single number, the best validation result.
        best_model_hyparams: a dictionary with the group of hyperparameters that achieved the best validation result.
    """
    # Check that the maximum number of rounds is an integer and does not exceed the total number of hyperparameter combinations
    assert isinstance(max_searching_round, int), 'max_searching_round should be an int.'
    assert max_searching_round <= total_group_number(hyparams_grid), 'max_searching_round must not exceed the total possible number of hyperparameter groups.'
    best_val_result = 0 # best validation result seen so far
    best_model_hyparams = dict() # hyperparameter group that achieved the best result
    searching_round = 0 # counter for the search rounds
    searching_results = dict() # results of every search round
    # Keep searching until the maximum number of rounds is reached, or the best result hits the target value
    while searching_round < max_searching_round and best_val_result < val_target:
        searching_round += 1 # next round
        print(f'Start searching round: {searching_round}')
        # Randomly pick one hyperparameter group
        random_select_hyparams = {key: random.choice(value) for key, value in hyparams_grid.items()}
        # If this group has already been tried, resample until a new group is drawn
        while random_select_hyparams in [value['Hyperparameters'] for value in searching_results.values()]:
            random_select_hyparams = {key: random.choice(value) for key, value in hyparams_grid.items()}
        print('-------------------------------------------')
        print(f'Random Selected Hyperparameters: {random_select_hyparams}')
        # Load the model
        config = BertConfig.from_pretrained(model_name, num_labels = 4,
                                            hidden_dropout_prob = random_select_hyparams['hidden_dropout_prob'],
                                            attention_probs_dropout_prob = random_select_hyparams['attention_probs_dropout_prob']
                                            )
        model = BertForSequenceClassification.from_pretrained(model_name, config = config).to("cuda")
        # Set the training arguments
        training_args = TrainingArguments(output_dir = './results', # output directory
                                          num_train_epochs = random_select_hyparams['num_train_epochs'], # number of training epochs
                                          warmup_steps = random_select_hyparams['warmup_steps'], # number of learning-rate warmup steps
                                          learning_rate = random_select_hyparams['learning_rate'], # initial learning rate
                                          weight_decay = random_select_hyparams['weight_decay'], # weight decay
                                          per_device_train_batch_size = 8, # training batch size
                                          per_device_eval_batch_size = 16, # evaluation batch size
                                          logging_dir = './logs', # directory for the logs
                                          load_best_model_at_end = True, # load the best checkpoint when training is finished
                                          evaluation_strategy = "steps",
                                          seed = 42
                                          )
        # Set up the trainer
        trainer = Trainer(model = model,
                          args = training_args,
                          train_dataset = train_dataset,
                          eval_dataset = valid_dataset,
                          compute_metrics = compute_metrics
                          )
        # Train the model with the currently selected hyperparameter group
        trainer.train()
        # Evaluate the model
        val_results = trainer.evaluate()
        val_result = val_results['eval_macro f1']
        searching_results['Round' + str(searching_round)] = {'Hyperparameters': random_select_hyparams, 'Validation results': val_results} # record this round
        if val_result > best_val_result:
            best_val_result = val_result
            best_model_hyparams = random_select_hyparams
        print(f'End of round {searching_round}, Validation results: {val_results}')
        print(f'Until now Best macro f1: {best_val_result}')
        print('-------------------------------------------')
    print(f'End of searching, total searching round: {searching_round}, Best macro f1: {best_val_result}, Best hyperparameters: {best_model_hyparams}')
    return searching_results, best_val_result, best_model_hyparams
5 Start the hyperparameter search~
The maximum number of rounds is set to 5 and the target is 0.85 (an overly optimistic target, but the maximum round count acts as a safety net anyway)
searching_results, best_val_result, best_model_hyparams = random_search(hyparams_grid, max_searching_round = 5, val_target = 0.85)
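After the search finishes, searching_results holds the outcome of every round; a small extra loop (not in the original) makes it easy to compare them:
# Each record stores the sampled hyperparameters and the full evaluation dict for that round
for round_name, record in searching_results.items():
    print(round_name, record['Validation results']['eval_macro f1'], record['Hyperparameters'])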
6 Finally, train the model on all of the training data with the best hyperparameter group found by the search, and predict the classes of the test data
When building the test dataset I used a column of fake labels, because I didn't know how to build a dataset without labels and feed it to the trainer. If you know how, please do share~ thanks in advance~ *fist bump*~~
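(For readers wondering the same thing: one possible approach, sketched here but not tested in this run, is a dataset class that simply returns no "labels" key, since Trainer.predict does not require labels. The class name is made up:)
class generate_unlabeled_dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, index):
        # only the encodings (input_ids / token_type_ids / attention_mask), no "labels" key
        return {key: torch.tensor(value[index]) for key, value in self.encodings.items()}
    def __len__(self):
        return len(self.encodings['input_ids'])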
# Rebuild the training dataset from all of the training data
X_encodings = tokenizer(X, truncation = True, padding = True, max_length = max_length)
X_dataset = generate_dataset(X_encodings, Y)
# Train the model with the best hyperparameters found by the search (both dropout values are applied, matching the search)
config = BertConfig.from_pretrained(model_name, num_labels = 4,
                                    hidden_dropout_prob = best_model_hyparams['hidden_dropout_prob'],
                                    attention_probs_dropout_prob = best_model_hyparams['attention_probs_dropout_prob'])
model = BertForSequenceClassification.from_pretrained(model_name, config = config).to("cuda")
training_args = TrainingArguments(output_dir = './results',
                                  num_train_epochs = best_model_hyparams['num_train_epochs'],
                                  warmup_steps = best_model_hyparams['warmup_steps'],
                                  learning_rate = best_model_hyparams['learning_rate'],
                                  weight_decay = best_model_hyparams['weight_decay'],
                                  per_device_train_batch_size = 8,
                                  per_device_eval_batch_size = 16,
                                  logging_dir = './logs',
                                  seed = 42
                                  )
trainer = Trainer(model = model,
args = training_args,
train_dataset = X_dataset,
compute_metrics = compute_metrics
)
trainer.train()
# Build the test dataset (with a column of dummy labels, as explained above)
test_encodings = tokenizer(Xt, truncation = True, padding = True, max_length = max_length)
Fake_test_labels = list(np.repeat(0, len(Xt)))
test_dataset = generate_dataset(test_encodings, Fake_test_labels)
# Predict on the test set
pred_test = trainer.predict(test_dataset)
Y_pred = pred_test.predictions             # raw logits, shape (number of test examples, 4)
Y_test_pred = np.argmax(Y_pred, axis = 1)  # predicted class id (0-3) for each test example
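Not part of the original workflow: if you want to keep the fine-tuned model and dump the predictions to a file, something like the following should work (output paths and file format are my own assumptions):
trainer.save_model('./final_model')           # hypothetical output directory
tokenizer.save_pretrained('./final_model')
# np.int64 is not JSON-serializable, hence the int() conversion; the file name is made up
json.dump([int(label) for label in Y_test_pred], open('test_predictions.json', 'w'))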
Finally, the test results~
The final test macro F1 was 0.72+, which ranked around 40th out of 200-odd students. My initial target was 0.71+, so once I reached it I stopped searching. Honestly I just didn't spend enough time on the hyperparameter search, mainly because GPUs are so expensive o(╥﹏╥)o. I believe a few more search rounds could still push the score up a notch~~
That's all for this write-up. Hope you all train the model of your dreams!