diff --git a/examples/lm_finetuning/pregenerate_training_data.py b/examples/lm_finetuning/pregenerate_training_data.py
index 498ab22333..8cc28d2e78 100644
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -49,7 +49,7 @@ class DocumentDatabase:
                 self._precalculate_doc_weights()
             rand_start = self.doc_cumsum[current_idx]
             rand_end = rand_start + self.cumsum_max - self.doc_lengths[current_idx]
-            sentence_index = randint(rand_start, rand_end) % self.cumsum_max
+            sentence_index = randint(rand_start, rand_end-1) % self.cumsum_max
             sampled_doc_index = np.searchsorted(self.doc_cumsum, sentence_index, side='right')
         else:
             # If we don't use sentence weighting, then every doc has an equal chance to be chosen
diff --git a/pytorch_pretrained_bert/modeling_gpt2.py b/pytorch_pretrained_bert/modeling_gpt2.py
index c381b288f8..7b00ce7730 100644
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -617,8 +617,14 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
         hidden_states, presents = self.transformer(input_ids, position_ids, token_type_ids, past)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
+
+            # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
             return loss
         return lm_logits, presents
 
@@ -690,8 +696,11 @@
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+            losses.append(loss_fct(shift_logits.view(-1,
+                                                     shift_logits.size(-1)), shift_labels.view(-1)))
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
diff --git a/pytorch_pretrained_bert/modeling_openai.py b/pytorch_pretrained_bert/modeling_openai.py
index 7bf643675e..fb3d0cadb7 100644
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -714,8 +714,14 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
         hidden_states = self.transformer(input_ids, position_ids, token_type_ids)
         lm_logits = self.lm_head(hidden_states)
         if lm_labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
+
+            # Flatten the tokens
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1))
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
             return loss
         return lm_logits
 
@@ -801,8 +807,11 @@
         mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
         losses = []
         if lm_labels is not None:
+            shift_logits = lm_logits[:, :-1].contiguous()
+            shift_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(lm_logits.view(-1, lm_logits.size(-1)), lm_labels.view(-1)))
+            losses.append(loss_fct(shift_logits.view(-1,
+                                                     shift_logits.size(-1)), shift_labels.view(-1)))
         if mc_labels is not None:
             loss_fct = CrossEntropyLoss()
             losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
diff --git a/pytorch_pretrained_bert/optimization.py b/pytorch_pretrained_bert/optimization.py
index fa911e5c04..aa59c7d7ec 100644
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):
     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps.
diff --git a/pytorch_pretrained_bert/optimization_openai.py b/pytorch_pretrained_bert/optimization_openai.py
index 7f56a1284c..99ac15e108 100644
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -26,7 +26,8 @@ logger = logging.getLogger(__name__)
 def warmup_cosine(x, warmup=0.002):
     if x < warmup:
         return x/warmup
-    return 0.5 * (1.0 + torch.cos(math.pi * x))
+    x_ = (x - warmup) / (1 - warmup)  # progress after warmup
+    return 0.5 * (1. + math.cos(math.pi * x_))
 
 def warmup_constant(x, warmup=0.002):
     """ Linearly increases learning rate over `warmup`*`t_total` (as provided to OpenAIAdam) training steps.
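
Note on the modeling changes: in a causal language model the logits at position i are scored against the token at position i+1, so the loss must compare lm_logits[:, :-1] with lm_labels[:, 1:]. The following is a minimal standalone sketch of that shifted loss; the batch/sequence/vocab sizes and the random tensors are purely illustrative, not taken from the library.

    # Sketch of the shifted language-modeling loss introduced by this patch.
    # Dummy shapes and random data are illustrative only.
    import torch
    from torch.nn import CrossEntropyLoss

    batch_size, seq_len, vocab_size = 2, 5, 11
    lm_logits = torch.randn(batch_size, seq_len, vocab_size)
    lm_labels = torch.randint(vocab_size, (batch_size, seq_len))

    # Drop the last time step of the logits and the first label,
    # so that tokens < n predict token n.
    shift_logits = lm_logits[:, :-1].contiguous()
    shift_labels = lm_labels[:, 1:].contiguous()

    loss_fct = CrossEntropyLoss(ignore_index=-1)
    loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
    print(loss.item())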
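Note on the schedule change: the old expression called torch.cos on a plain Python float and fed the raw progress x into the cosine, ignoring the warmup offset. The patched helper renormalizes progress after warmup so the multiplier decays from 1.0 at the end of warmup to 0.0 at the end of training. The corrected function can be exercised on its own; the sample progress values below are illustrative.

    # The corrected warmup_cosine schedule, evaluated at a few sample points.
    import math

    def warmup_cosine(x, warmup=0.002):
        if x < warmup:
            return x / warmup                       # linear warmup from 0 to 1
        x_ = (x - warmup) / (1 - warmup)            # progress after warmup, in [0, 1]
        return 0.5 * (1. + math.cos(math.pi * x_))  # cosine decay from 1 down to 0

    for x in (0.0, 0.001, 0.002, 0.25, 0.5, 1.0):
        print(f"{x:.3f} -> {warmup_cosine(x):.4f}")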
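Note on the pregenerate_training_data.py change: random.randint is inclusive at both ends, so the old upper bound admitted one extra value that, after the modulo by cumsum_max, appears to wrap back onto the current document, letting the "sample a different document" step occasionally return the document it was meant to skip. A toy check of the bound behaviour, with made-up offsets:

    # Toy check that random.randint includes its upper bound
    # (unlike numpy.random.randint); the offsets are made up.
    from random import randint

    rand_start, window = 4, 7
    rand_end = rand_start + window

    old_max = max(randint(rand_start, rand_end) for _ in range(10000))
    new_max = max(randint(rand_start, rand_end - 1) for _ in range(10000))
    print(old_max, new_max)  # almost surely prints: 11 10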