Next, let's analyze the rest of the __call__ function in data_collator.py:
if self.mlm:
    batch["input_ids"], batch["labels"] = self.mask_tokens(
        batch["input_ids"], special_tokens_mask=special_tokens_mask
    )
This calls self.mask_tokens, so we step into that function:
def mask_tokens(
    self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
) -> Tuple[torch.Tensor, torch.Tensor]:
    print('data/data_collator.py mask_tokens')
    """
    Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
    """
    labels = inputs.clone()
    r"""
    labels = tensor(
        [[  101,   169,   107,  ..., 10539,   107,   102],
         [  101,   169,   107,  ...,   100,   100,   102],
         [  101,   169,   107,  ...,   100,   100,   102],
         ...,
         [  101,   169,   107,  ...,   100,   100,   102],
         [  101,   169,   107,  ...,   100,   100,   102],
         [  101,   169,   107,  ...,   117,   169,   102]])
    """
    probability_matrix = torch.full(labels.shape, self.mlm_probability)
    r"""
    probability_matrix =
    tensor([[0.1500, 0.1500, ...],
            [0.1500, 0.1500, ...],
            ...])
    """
    if special_tokens_mask is None:
        special_tokens_mask = [
            self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
        ]
        special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
    else:
        special_tokens_mask = special_tokens_mask.bool()
    # special tokens ([CLS], [SEP], [PAD]) must never be masked
    probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked positions
    # 80% of the masked positions are replaced with [MASK]
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
    # half of the remaining 20% (10% overall) get a random token; the last 10% stay unchanged
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels
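To see the 80/10/10 arithmetic concretely: of the 15% of positions selected by the first bernoulli draw, bernoulli(0.8) sends 80% to [MASK]; of the remaining 20%, bernoulli(0.5) sends half (10% of the total) to random tokens, and the final 10% keep their original token. Below is a minimal self-contained sketch of this logic, with toy token ids and 103 assumed as the [MASK] id (as in BERT-style vocabularies):

import torch

torch.manual_seed(0)
vocab_size, mask_token_id, mlm_probability = 21128, 103, 0.15

inputs = torch.randint(1000, 2000, (4, 16))  # toy batch of token ids
labels = inputs.clone()

probability_matrix = torch.full(labels.shape, mlm_probability)
masked_indices = torch.bernoulli(probability_matrix).bool()
labels[~masked_indices] = -100               # only masked positions keep a label

# 80% of the masked positions -> [MASK]
indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
inputs[indices_replaced] = mask_token_id

# half of the remaining 20% (10% overall) -> random token; the rest stay unchanged
indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
inputs[indices_random] = torch.randint(vocab_size, labels.shape, dtype=torch.long)[indices_random]

print((labels != -100).float().mean())       # roughly 0.15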
First, labels is created as a copy of inputs:
labels = inputs.clone()
labels = tensor(
[[ 101, 169, 107, ..., 10539, 107, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
...,
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 117, 169, 102]]
)
Next, the probability matrix is constructed:
probability_matrix = torch.full(labels.shape, self.mlm_probability)
which gives:
probability_matrix =
tensor([[0.1500, 0.1500, ...],
        [0.1500, 0.1500, ...],
        ...,
        [0.1500, 0.1500, ...]])
Next, look at how special_tokens_mask is built (it is what masked_indices is derived from):
if special_tokens_mask is None:
    special_tokens_mask = [
        self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
    ]
    print('special_tokens_mask1 = ')
    print(special_tokens_mask)
    special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
    print('special_tokens_mask2 = ')
    print(special_tokens_mask)
else:
    special_tokens_mask = special_tokens_mask.bool()
The printed output is:
special_tokens_mask1 =
[[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
.....................
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1]]
and after the conversion to a boolean tensor, special_tokens_mask2 is:
special_tokens_mask2 =
tensor([[ True, False, False, ..., False, False, True],
...,
[ True, False, False, ..., False, False, True]])
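Once this boolean mask is available, masked_fill_ zeroes the masking probability at special-token positions ([CLS], [SEP], [PAD]) so they can never be selected. A small hand-built sketch of that step:

import torch

probability_matrix = torch.full((2, 6), 0.15)
# 1 marks a special token; built by hand here instead of calling
# tokenizer.get_special_tokens_mask
special_tokens_mask = torch.tensor([[1, 0, 0, 0, 0, 1],
                                    [1, 0, 0, 0, 1, 1]], dtype=torch.bool)
probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
print(probability_matrix)
# tensor([[0.0000, 0.1500, 0.1500, 0.1500, 0.1500, 0.0000],
#         [0.0000, 0.1500, 0.1500, 0.1500, 0.0000, 0.0000]])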
Computing special_tokens_mask1 requires calling the get_special_tokens_mask function:
special_tokens_mask = [
    self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
]
Stepping into self.tokenizer.get_special_tokens_mask: self.tokenizer here is a PreTrainedTokenizer, so the place to look is the PreTrainedTokenizer class in transformers/tokenization_utils.py.
Back in __call__, this tokenizer is what pads the batch:
if isinstance(examples[0], (dict, BatchEncoding)):
    batch = self.tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=self.pad_to_multiple_of)
print('|||self.tokenizer = |||')
print(self.tokenizer)
|||self.tokenizer = |||
PreTrainedTokenizer is_fast
PreTrainedTokenizer is_fast
PreTrainedTokenizer(name_or_path='/home/xiaoguzai/数据/nezha-chinese-base/vocab.txt', vocab_size=21128, model_max_len=1000000000000000019884624838656, is_fast=False, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})
Note that printing the tokenizer goes through is_fast, because is_fast is declared as a @property:
@property
def is_fast(self) -> bool:
    return False
It is not obvious why is_fast gets called at this point; my guess is that it is accessed during initialization (or while the tokenizer's repr is built for the print above). In this run, self.pad_to_multiple_of = None. The pad being called is PreTrainedTokenizer.pad, which resolves to the pad function inherited from PreTrainedTokenizerBase in tokenization_utils_base.
batch =
{'input_ids': tensor(
    [[101, 169, ..., 102],
     ...,
     [101, 169, ..., 102]]),
 'attention_mask': tensor(
    [[1, 1, ..., 1, 1],
     ...,
     [1, 1, ..., 1, 1]])}
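As a usage sketch of this pad call (the checkpoint name is only an assumption for illustration), PreTrainedTokenizerBase.pad accepts a list of feature dicts and returns a padded batch with input_ids and attention_mask:

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")  # any BERT-style vocab works
examples = [{"input_ids": [101, 2769, 102]},
            {"input_ids": [101, 2769, 812, 1962, 102]}]
batch = tokenizer.pad(examples, return_tensors="pt", pad_to_multiple_of=None)
print(batch["input_ids"].shape)   # torch.Size([2, 5])
print(batch["attention_mask"])    # 0 at the padded positions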
Here is an overview of the full batch contents (after mask_tokens has added the labels):
batch =
{'input_ids': tensor([[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 100, 100, 102],
[ 101, 169, 107, ..., 107, 10539, 102],
...,
[ 101, 169, 107, ..., 131, 107, 102],
[ 101, 169, 103, ..., 100, 124, 102],
[ 101, 169, 107, ..., 171, 117, 102]]), 'attention_mask': tensor([[1, 1, 1, ..., 1, 1, 1],
[1, 1, 1, ..., 1, 1, 1],
[1, 1, 1, ..., 1, 1, 1],
...,
[1, 1, 1, ..., 1, 1, 1],
[1, 1, 1, ..., 1, 1, 1],
[1, 1, 1, ..., 1, 1, 1]]), 'labels': tensor([[-100, -100, -100, ..., -100, -100, -100],
[-100, -100, -100, ..., -100, -100, -100],
[-100, -100, -100, ..., -100, -100, -100],
...,
[-100, -100, -100, ..., -100, -100, -100],
[-100, -100, 107, ..., -100, -100, -100],
[-100, -100, 107, ..., -100, -100, -100]])}
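A quick sanity check on the structure of such a batch: labels is -100 exactly where the position was not selected for masking, so the supervised positions can be recovered by comparing against -100. A toy stand-in batch is used below; the values are hypothetical:

import torch

# toy stand-in for the collator output above
batch = {
    "input_ids": torch.tensor([[101, 103, 2769, 102]]),    # position 1 was masked to [MASK]=103
    "labels":    torch.tensor([[-100, 2769, -100, -100]]), # original id kept only there
}
supervised = batch["labels"] != -100
print(supervised.float().mean())    # fraction of positions that contribute to the loss
print(batch["labels"][supervised])  # tensor([2769]): the token the model must predict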
This batch is then fed into the model for training. Let's read the pretraining forward pass together with the loss function it uses:
outputs = self.bert(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
    head_mask=head_mask,
    inputs_embeds=inputs_embeds,
    encoder_hidden_states=encoder_hidden_states,
    encoder_attention_mask=encoder_attention_mask,
)
sequence_output = outputs[0]
prediction_scores = self.cls(sequence_output)
outputs = (prediction_scores,) + outputs[2:]
masked_lm_labels = None
if labels is not None:
    loss_fct = CrossEntropyLoss()
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
Here self.bert is the NeZhaModel backbone (the enclosing model adds the cls layer on top); the next call is
prediction_scores = self.cls(sequence_output)
where the layer is defined as
self.cls = BertOnlyMLMHead(config)
Stepping into the BertOnlyMLMHead layer to trace the call:
class BertOnlyMLMHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.predictions = BertLMPredictionHead(config)

    def forward(self, sequence_output):
        prediction_scores = self.predictions(sequence_output)
        return prediction_scores
This calls the BertLMPredictionHead layer, which in turn uses BertPredictionHeadTransform:
class BertPredictionHeadTransform(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
        if isinstance(config.hidden_act, str):
            self.transform_act_fn = ACT2FN[config.hidden_act]
        else:
            print('BertPredictionHeadTransform situation2')
            self.transform_act_fn = config.hidden_act
        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(self, hidden_states):
        hidden_states = self.dense(hidden_states)
        hidden_states = self.transform_act_fn(hidden_states)
        hidden_states = self.LayerNorm(hidden_states)
        return hidden_states

class BertLMPredictionHead(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.transform = BertPredictionHeadTransform(config)
        # separate bias parameter, so the decoder weight can be tied to the input embeddings
        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
        self.decoder.bias = self.bias

    def forward(self, hidden_states):
        hidden_states = self.transform(hidden_states)
        hidden_states = self.decoder(hidden_states)
        return hidden_states
The complete stack of layers applied to sequence_output is therefore:
Linear(config.hidden_size, config.hidden_size)
'gelu' activation
LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
Linear(config.hidden_size, config.vocab_size)
(the bias of the final Linear is initialized to zeros)
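A minimal sketch of this stack, just to verify the shape flow (hidden_size, vocab_size, eps and the sequence length are assumed values):

import torch
import torch.nn as nn

hidden_size, vocab_size = 768, 21128

head = nn.Sequential(
    nn.Linear(hidden_size, hidden_size),   # BertPredictionHeadTransform.dense
    nn.GELU(),                             # transform_act_fn for 'gelu'
    nn.LayerNorm(hidden_size, eps=1e-12),  # BertPredictionHeadTransform.LayerNorm
    nn.Linear(hidden_size, vocab_size),    # decoder (bias zero-initialized below)
)
nn.init.zeros_(head[3].bias)               # mirrors self.bias = nn.Parameter(torch.zeros(vocab_size))

sequence_output = torch.randn(2, 90, hidden_size)  # [batch, seq_len, hidden]
prediction_scores = head(sequence_output)
print(prediction_scores.shape)             # torch.Size([2, 90, 21128])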
Finally, the CrossEntropyLoss is applied:
if labels is not None:
    loss_fct = CrossEntropyLoss()
    masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
    outputs = (masked_lm_loss,) + outputs
!!! Note: the modeling.py to read here is the one under pretrain_code/modeling/modeling_nezha/modeling.py. In other words, what is essentially computed, for the batch shown above,
is the cross entropy between prediction_scores.view(-1, self.config.vocab_size), of shape [2880, 21128], produced from input_ids, and labels.view(-1), of shape [2880], most of whose entries are -100. CrossEntropyLoss ignores the label -100 by default (ignore_index=-100), so positions labeled -100 contribute nothing to the loss; this is how the unmasked positions are excluded from the training objective.
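A small check of that ignore_index behavior (the shapes match the run above; the data is random):

import torch
from torch.nn import CrossEntropyLoss

vocab_size = 21128
loss_fct = CrossEntropyLoss()                         # ignore_index defaults to -100

scores = torch.randn(2880, vocab_size)                # prediction_scores.view(-1, vocab_size)
labels = torch.full((2880,), -100, dtype=torch.long)  # start with every position ignored
labels[:432] = torch.randint(vocab_size, (432,))      # ~15% supervised positions

full_loss = loss_fct(scores, labels)
masked_only = loss_fct(scores[:432], labels[:432])
print(torch.allclose(full_loss, masked_only))         # True: the -100 rows contribute nothing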