adding examples for openai and transformer-xl

parent 9c3c24800b
commit d482e3d79d

@@ -0,0 +1,344 @@
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" OpenAI GPT model fine-tuning script.
|
||||
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
|
||||
It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
|
||||
|
||||
This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset
|
||||
"""
import argparse
import os
import csv
import random
import logging
from tqdm import tqdm, trange

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer, OpenAIAdam

# from analysis import rocstories as rocstories_analysis
# from datasets import rocstories
# from model_pytorch import DoubleHeadModel, load_openai_pretrained_model
# from opt import OpenAIAdam
# from text_utils import TextEncoder
# from utils import (encode_dataset, iter_data,
#                    ResultLogger, make_path)
# from loss import MultipleChoiceLossCompute

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

def iter_apply(Xs, Ms, Ys):
    # fns = [lambda x: np.concatenate(x, 0), lambda x: float(np.sum(x))]
    logits = []
    cost = 0
    with torch.no_grad():
        dh_model.eval()
        for xmb, mmb, ymb in iter_data(Xs, Ms, Ys, n_batch=n_batch_train, truncate=False, verbose=True):
            n = len(xmb)
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            YMB = torch.tensor(ymb, dtype=torch.long).to(device)
            MMB = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(XMB)
            clf_logits *= n
            clf_losses = compute_loss_fct(XMB, YMB, MMB, clf_logits, only_return_losses=True)
            clf_losses *= n
            logits.append(clf_logits.to("cpu").numpy())
            cost += clf_losses.sum().item()
        logits = np.concatenate(logits, 0)
    return logits, cost


def iter_predict(Xs, Ms):
    logits = []
    with torch.no_grad():
        dh_model.eval()
        for xmb, mmb in iter_data(Xs, Ms, n_batch=n_batch_train, truncate=False, verbose=True):
            n = len(xmb)
            XMB = torch.tensor(xmb, dtype=torch.long).to(device)
            MMB = torch.tensor(mmb).to(device)
            _, clf_logits = dh_model(XMB)
            logits.append(clf_logits.to("cpu").numpy())
        logits = np.concatenate(logits, 0)
    return logits


def log(save_dir, desc):
    global best_score
    print("Logging")
    tr_logits, tr_cost = iter_apply(trX[:n_valid], trM[:n_valid], trY[:n_valid])
    va_logits, va_cost = iter_apply(vaX, vaM, vaY)
    tr_cost = tr_cost / len(trY[:n_valid])
    va_cost = va_cost / n_valid
    tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100.
    va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100.
    logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc)
    print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc))
    if submit:
        score = va_acc
        if score > best_score:
            best_score = score
            path = os.path.join(save_dir, desc, 'best_params')
            torch.save(dh_model.state_dict(), make_path(path))


def predict(dataset, submission_dir):
    filename = filenames[dataset]
    pred_fn = pred_fns[dataset]
    label_decoder = label_decoders[dataset]
    predictions = pred_fn(iter_predict(teX, teM))
    if label_decoder is not None:
        predictions = [label_decoder[prediction] for prediction in predictions]
    path = os.path.join(submission_dir, filename)
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'w') as f:
        f.write('{}\t{}\n'.format('index', 'prediction'))
        for i, prediction in enumerate(predictions):
            f.write('{}\t{}\n'.format(i, prediction))


def run_epoch():
    for xmb, mmb, ymb in iter_data(*shuffle(trX, trM, trYt, random_state=np.random),
                                   n_batch=n_batch_train, truncate=True, verbose=True):
        global n_updates
        dh_model.train()
        XMB = torch.tensor(xmb, dtype=torch.long).to(device)
        YMB = torch.tensor(ymb, dtype=torch.long).to(device)
        MMB = torch.tensor(mmb).to(device)
        lm_logits, clf_logits = dh_model(XMB)
        compute_loss_fct(XMB, YMB, MMB, clf_logits, lm_logits)
        n_updates += 1
        if n_updates in [1000, 2000, 4000, 8000, 16000, 32000] and n_epochs == 0:
            log(save_dir, desc)


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)

def load_rocstories_dataset(dataset_path):
    """ Output a list of tuples (story, 1st continuation, 2nd continuation, label) """
    with open(dataset_path, encoding='utf_8') as f:
        f = csv.reader(f)
        output = []
        next(f)  # skip the first line
        for line in tqdm(f):
            output.append((' '.join(line[1:5]), line[5], line[6], int(line[-1])-1))
    return output
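
# Layout of each row of the (assumed) RocStories cloze-test csv read above:
#   line[0]    story id
#   line[1:5]  the four context sentences, joined into `story`
#   line[5]    candidate ending 1, line[6] candidate ending 2
#   line[-1]   index of the right ending (1 or 2), shifted to a 0/1 label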

def pre_process_dataset(encoded_dataset, max_len, start_token, delimiter_token, clf_token):
    n_batch = len(encoded_dataset)
    input_len = 2 * max_len + 3  # story + continuation + 3 special tokens
    input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
    mc_token_mask = np.zeros((n_batch, 2, input_len), dtype=np.int64)
    lm_labels = np.full((n_batch, 2, input_len), -1, dtype=np.int64)
    mc_labels = np.zeros((n_batch,), dtype=np.int64)
    for i, (story, cont1, cont2, mc_label) in enumerate(encoded_dataset):
        with_cont1 = [start_token] + story[:max_len] + [delimiter_token] + cont1[:max_len] + [clf_token]
        with_cont2 = [start_token] + story[:max_len] + [delimiter_token] + cont2[:max_len] + [clf_token]
        input_ids[i, 0, :len(with_cont1)] = with_cont1
        input_ids[i, 1, :len(with_cont2)] = with_cont2
        mc_token_mask[i, 0, len(with_cont1) - 1] = 1
        mc_token_mask[i, 1, len(with_cont2) - 1] = 1
        lm_labels[i, 0, :len(with_cont1)-1] = with_cont1[1:]
        lm_labels[i, 1, :len(with_cont2)-1] = with_cont2[1:]
        mc_labels[i] = mc_label
    all_inputs = (input_ids, mc_token_mask, lm_labels, mc_labels)
    all_input_tensors = list(torch.tensor(t) for t in all_inputs)
    return all_input_tensors
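
# Shape sketch of the tensors returned above, for n_batch stories and input_len = 2 * max_len + 3:
#   input_ids     (n_batch, 2, input_len)  token ids of [choice 1, choice 2]
#   mc_token_mask (n_batch, 2, input_len)  1 at the position of the _classify_ token of each choice
#   lm_labels     (n_batch, 2, input_len)  next-token targets, -1 where the LM loss is ignored
#   mc_labels     (n_batch,)               index (0 or 1) of the correct continuation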


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument('--data_dir', type=str, default='data/')
    # the following arguments are referenced later in the script; the defaults are placeholders
    parser.add_argument('--dataset_path', type=str, default='',
                        help='path to the RocStories csv used for fine-tuning')
    parser.add_argument('--output_dir', type=str, default='log/',
                        help='directory where the fine-tuned model and eval results are written')
    parser.add_argument('--do_train', action='store_true')
    parser.add_argument('--do_eval', action='store_true')
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--max_grad_norm', type=float, default=1)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.5)
    parser.add_argument('--n_valid', type=int, default=374)
    args = parser.parse_args()
    print(args)

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: %s, n_gpu: %d", device, n_gpu)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens))
    model.to(device)

    # Load the dataset and prepare the inputs
    logger.info("Encoding dataset...")
    dataset = load_rocstories_dataset(args.dataset_path)
    # the last field of each instance is the integer label, only the three text fields are tokenized
    tokenized_dataset = list(list(tokenizer.tokenize(field) for field in instance[:3]) + [instance[3]] for instance in dataset)
    encoded_dataset = list(list(tokenizer.convert_tokens_to_ids(field) for field in instance[:3]) + [instance[3]] for instance in tokenized_dataset)

    max_input_length = max(len(story)+max(len(cont1), len(cont2))+3 for story, cont1, cont2, _ in encoded_dataset)
    max_input_length = min(max_input_length, model.config.n_positions)  # Max size of input for the pre-trained model
    max_sub_part_length = max_input_length // 2 - 2
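
    # Worked example of the length bookkeeping above (illustrative numbers):
    # with n_positions = 512 and a longest story + best continuation of 300 tokens,
    # max_input_length = min(300 + 3, 512) = 303 and max_sub_part_length = 303 // 2 - 2 = 149,
    # so every "[_start_] story [_delimiter_] continuation [_classify_]" sequence built in
    # pre_process_dataset fits in at most 2 * 149 + 3 = 301 positions.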

    # Prepare dataloader
    dataset_tensors = pre_process_dataset(encoded_dataset, max_sub_part_length, *special_tokens_ids)
    train_data = TensorDataset(*dataset_tensors)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        global_step = 0
        nb_tr_steps = 0
        tr_loss = 0
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_mask, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_mask, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
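                # The double-heads model returns one loss per head: losses[0] is the
                # language-modeling loss and losses[1] the multiple-choice loss, so the
                # combined objective is e.g. 0.5 * lm_loss + mc_loss with the default --lm_coef.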
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                global_step += 1
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1

    # Save a trained model
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)

    # Load a trained model that you have fine-tuned
    model_state_dict = torch.load(output_model_file)
    model.load_state_dict(model_state_dict)
    model.to(device)

    if args.do_eval:
        eval_examples = processor.get_dev_examples(args.data_dir)
        eval_features = convert_examples_to_features(
            eval_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                tmp_eval_loss = model(input_ids, segment_ids, input_mask, label_ids)
                logits = model(input_ids, segment_ids, input_mask)

            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(logits, label_ids)

            eval_loss += tmp_eval_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        loss = tr_loss/nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'global_step': global_step,
                  'loss': loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

if __name__ == "__main__":
    main()

n_updates = 0
n_epochs = 0
if dataset != 'stsb':
    trYt = trY
if submit:
    path = os.path.join(save_dir, desc, 'best_params')
    torch.save(dh_model.state_dict(), make_path(path))
best_score = 0
for i in range(args.n_iter):
    print("running epoch", i)
    run_epoch()
    n_epochs += 1
    log(save_dir, desc)
if submit:
    path = os.path.join(save_dir, desc, 'best_params')
    dh_model.load_state_dict(torch.load(path))
    predict(dataset, args.submission_dir)
    if args.analysis:
        rocstories_analysis(data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'),
                            os.path.join(log_dir, 'rocstories.jsonl'))

@@ -16,17 +16,15 @@
""" PyTorch Transformer XL model evaluation script.
Adapted from https://github.com/kimiyoung/transformer-xl.
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py

This script with default values evaluates a pretrained Transformer-XL on WikiText 103
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import functools
import argparse
import logging
import time
import math
import sys
from io import open

import torch

@@ -39,10 +37,7 @@ logger = logging.getLogger(__name__)


parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
# parser.add_argument('--data', type=str, default='../data/wikitext-103',
#                     help='location of the data corpus')
parser.add_argument('--model_name', type=str, default='transfo-xl-wt103',
                    # choices=['transfo-xl-wt103'], #, 'lm1b', 'enwik8', 'text8'],
                    help='pretrained model name')
parser.add_argument('--split', type=str, default='test',
                    choices=['all', 'valid', 'test'],

@@ -70,11 +65,11 @@ assert args.ext_len >= 0, 'extended context length must be non-negative'

device = torch.device("cuda" if args.cuda else "cpu")

# Get logger
# logging = get_logger(os.path.join(args.work_dir, 'log.txt'),
#                      log_=not args.no_log)

# Load dataset
# Load a pre-processed dataset
# You can also build the corpus yourself using TransfoXLCorpus methods
# The pre-processing involves computing word frequencies to prepare the adaptive input and softmax
# and tokenizing the dataset
# The pre-processed corpus was produced with the conversion script
corpus = TransfoXLCorpus.from_pretrained(args.model_name)
ntokens = len(corpus.vocab)

@@ -83,10 +78,7 @@ va_iter = corpus.get_iterator('valid', args.batch_size, args.tgt_len,
te_iter = corpus.get_iterator('test', args.batch_size, args.tgt_len,
                              device=device, ext_len=args.ext_len)

# Load the best saved model.
# with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:
#     model = torch.load(f)
# model.backward_compatible()
# Load a pre-trained model
model = TransfoXLModel.from_pretrained(args.model_name)
model = model.to(device)

@@ -132,10 +124,6 @@ elif args.split == 'test':
    valid_loss = None

def format_log(loss, split):
    # if args.dataset in ['enwik8', 'text8']:
    #     log_str = '| {0} loss {1:5.2f} | {0} bpc {2:9.5f} '.format(
    #         split, loss, loss / math.log(2))
    # else:
    log_str = '| {0} loss {1:5.2f} | {0} ppl {2:9.3f} '.format(
        split, loss, math.exp(loss))
    return log_str
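
# For word-level datasets the reported perplexity is exp(average cross-entropy loss),
# e.g. a test loss of 3.00 corresponds to ppl = exp(3.00) ≈ 20.1.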

@@ -0,0 +1,595 @@
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Transformer XL model training script.
|
||||
Adapted from https://github.com/kimiyoung/transformer-xl.
|
||||
In particular https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/eval.py
|
||||
|
||||
This script with default values train a Transformer-XL on WikiText 103
|
||||
"""
from __future__ import absolute_import, division, print_function, unicode_literals

import os
import functools
import argparse
import logging
import time
import math
import sys
from io import open
import itertools

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim

from pytorch_pretrained_bert import TransfoXLModel, TransfoXLConfig
from pytorch_pretrained_bert.tokenization_transfo_xl import get_lm_corpus

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)


parser = argparse.ArgumentParser(description='PyTorch Transformer Language Model')
parser.add_argument('--data', type=str, default='../data/wikitext-103',
                    help='location of the data corpus')
parser.add_argument('--dataset', type=str, default='wt103',
                    choices=['wt103', 'lm1b', 'enwik8', 'text8'],
                    help='dataset name')
parser.add_argument('--n_layer', type=int, default=12,
                    help='number of total layers')
parser.add_argument('--n_head', type=int, default=10,
                    help='number of heads')
parser.add_argument('--d_head', type=int, default=50,
                    help='head dimension')
parser.add_argument('--d_embed', type=int, default=-1,
                    help='embedding dimension')
parser.add_argument('--d_model', type=int, default=500,
                    help='model dimension')
parser.add_argument('--d_inner', type=int, default=1000,
                    help='inner dimension in FF')
parser.add_argument('--dropout', type=float, default=0.0,
                    help='global dropout rate')
parser.add_argument('--dropatt', type=float, default=0.0,
                    help='attention probability dropout rate')
parser.add_argument('--init', default='normal', type=str,
                    help='parameter initializer to use.')
parser.add_argument('--emb_init', default='normal', type=str,
                    help='parameter initializer to use.')
parser.add_argument('--init_range', type=float, default=0.1,
                    help='parameters initialized by U(-init_range, init_range)')
parser.add_argument('--emb_init_range', type=float, default=0.01,
                    help='parameters initialized by U(-init_range, init_range)')
parser.add_argument('--init_std', type=float, default=0.02,
                    help='parameters initialized by N(0, init_std)')
parser.add_argument('--proj_init_std', type=float, default=0.01,
                    help='parameters initialized by N(0, init_std)')
parser.add_argument('--optim', default='adam', type=str,
                    choices=['adam', 'sgd', 'adagrad'],
                    help='optimizer to use.')
parser.add_argument('--lr', type=float, default=0.00025,
                    help='initial learning rate (0.00025|5 for adam|sgd)')
parser.add_argument('--mom', type=float, default=0.0,
                    help='momentum for sgd')
parser.add_argument('--scheduler', default='cosine', type=str,
                    choices=['cosine', 'inv_sqrt', 'dev_perf', 'constant'],
                    help='lr scheduler to use.')
parser.add_argument('--warmup_step', type=int, default=0,
                    help='number of linear warmup steps')
parser.add_argument('--decay_rate', type=float, default=0.5,
                    help='decay factor when ReduceLROnPlateau is used')
parser.add_argument('--lr_min', type=float, default=0.0,
                    help='minimum learning rate during annealing')
parser.add_argument('--clip', type=float, default=0.25,
                    help='gradient clipping')
parser.add_argument('--clip_nonemb', action='store_true',
                    help='only clip the gradient of non-embedding params')
parser.add_argument('--max_step', type=int, default=100000,
                    help='upper limit on the number of training steps')
parser.add_argument('--batch_size', type=int, default=60,
                    help='batch size')
parser.add_argument('--batch_chunk', type=int, default=1,
                    help='split batch into chunks to save memory')
parser.add_argument('--tgt_len', type=int, default=70,
                    help='number of tokens to predict')
parser.add_argument('--eval_tgt_len', type=int, default=50,
                    help='number of tokens to predict for evaluation')
parser.add_argument('--ext_len', type=int, default=0,
                    help='length of the extended context')
parser.add_argument('--mem_len', type=int, default=0,
                    help='length of the retained previous heads')
parser.add_argument('--not_tied', action='store_true',
                    help='do not tie the word embedding and softmax weights')
parser.add_argument('--seed', type=int, default=1111,
                    help='random seed')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--adaptive', action='store_true',
                    help='use adaptive softmax')
parser.add_argument('--div_val', type=int, default=1,
                    help='divisor value for adaptive input and softmax')
parser.add_argument('--pre_lnorm', action='store_true',
                    help='apply LayerNorm to the input instead of the output')
parser.add_argument('--varlen', action='store_true',
                    help='use variable length')
parser.add_argument('--multi_gpu', action='store_true',
                    help='use multiple GPU')
parser.add_argument('--log-interval', type=int, default=200,
                    help='report interval')
parser.add_argument('--eval-interval', type=int, default=4000,
                    help='evaluation interval')
parser.add_argument('--work_dir', default='LM-TFM', type=str,
                    help='experiment directory.')
parser.add_argument('--restart', action='store_true',
                    help='restart training from the saved checkpoint')
parser.add_argument('--restart_dir', type=str, default='',
                    help='restart dir')
parser.add_argument('--debug', action='store_true',
                    help='run in debug mode (do not create exp dir)')
parser.add_argument('--same_length', action='store_true',
                    help='use the same attn length for all tokens')
parser.add_argument('--attn_type', type=int, default=0,
                    help='attention type. 0 for ours, 1 for Shaw et al, '
                         '2 for Vaswani et al, 3 for Al Rfou et al.')
parser.add_argument('--clamp_len', type=int, default=-1,
                    help='use the same pos embeddings after clamp_len')
parser.add_argument('--eta_min', type=float, default=0.0,
                    help='min learning rate for cosine scheduler')
parser.add_argument('--gpu0_bsz', type=int, default=-1,
                    help='batch size on gpu 0')
parser.add_argument('--max_eval_steps', type=int, default=-1,
                    help='max eval steps')
parser.add_argument('--sample_softmax', type=int, default=-1,
                    help='number of samples in sampled softmax')
parser.add_argument('--patience', type=int, default=0,
                    help='patience')
parser.add_argument('--finetune_v2', action='store_true',
                    help='finetune v2')
parser.add_argument('--finetune_v3', action='store_true',
                    help='finetune v3')
parser.add_argument('--fp16', action='store_true',
                    help='Run in pseudo-fp16 mode (fp16 storage fp32 math).')
parser.add_argument('--static-loss-scale', type=float, default=1,
                    help='Static loss scale, positive power of 2 values can '
                         'improve fp16 convergence.')
parser.add_argument('--dynamic-loss-scale', action='store_true',
                    help='Use dynamic loss scaling. If supplied, this argument'
                         ' supersedes --static-loss-scale.')
args = parser.parse_args()
args.tied = not args.not_tied

if args.d_embed < 0:
    args.d_embed = args.d_model

assert args.ext_len >= 0, 'extended context length must be non-negative'
assert args.batch_size % args.batch_chunk == 0

args.work_dir = '{}-{}'.format(args.work_dir, args.dataset)
args.work_dir = os.path.join(args.work_dir, time.strftime('%Y%m%d-%H%M%S'))
os.makedirs(args.work_dir, exist_ok=True)  # checkpoints are written here below
# logging = create_exp_dir(args.work_dir,
#     scripts_to_save=['train.py', 'mem_transformer.py'], debug=args.debug)

# Set the random seed manually for reproducibility.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if torch.cuda.is_available():
    if not args.cuda:
        print('WARNING: You have a CUDA device, so you should probably run with --cuda')
    else:
        torch.cuda.manual_seed_all(args.seed)

# Validate `--fp16` option
if args.fp16:
    if not args.cuda:
        print('WARNING: --fp16 requires --cuda, ignoring --fp16 option')
        args.fp16 = False
    else:
        try:
            from apex.fp16_utils import FP16_Optimizer
        except ImportError:
            print('WARNING: apex not installed, ignoring --fp16 option')
            args.fp16 = False

device = torch.device('cuda' if args.cuda else 'cpu')

###############################################################################
# Load data
###############################################################################
corpus = get_lm_corpus(args.data, args.dataset)
ntokens = len(corpus.vocab)
args.n_token = ntokens

eval_batch_size = 10
tr_iter = corpus.get_iterator('train', args.batch_size, args.tgt_len,
                              device=device, ext_len=args.ext_len)
va_iter = corpus.get_iterator('valid', eval_batch_size, args.eval_tgt_len,
                              device=device, ext_len=args.ext_len)
te_iter = corpus.get_iterator('test', eval_batch_size, args.eval_tgt_len,
                              device=device, ext_len=args.ext_len)

# adaptive softmax / embedding
cutoffs = []
proj_share_all_but_first = True  # default, overridden below when --adaptive is set
if args.adaptive:
    assert args.dataset in ['wt103', 'lm1b']
    if args.dataset == 'wt103':
        cutoffs = [20000, 40000, 200000]
        proj_share_all_but_first = True
    elif args.dataset == 'lm1b':
        cutoffs = [60000, 100000, 640000]
        proj_share_all_but_first = False
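
# Sketch of how the wt103 cutoffs above split the adaptive softmax / embedding
# (WikiText-103 has a vocabulary of roughly 267k types):
#   cluster 0: ids [0, 20000)        kept at the full d_embed dimension
#   cluster 1: ids [20000, 40000)    projected to d_embed // div_val
#   cluster 2: ids [40000, 200000)   projected to d_embed // div_val**2
#   cluster 3: ids [200000, n_token) projected to d_embed // div_val**3
# (with the default --div_val 1 all clusters keep the full dimension)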

###############################################################################
# Build the model
###############################################################################
def init_weight(weight):
    if args.init == 'uniform':
        nn.init.uniform_(weight, -args.init_range, args.init_range)
    elif args.init == 'normal':
        nn.init.normal_(weight, 0.0, args.init_std)

def init_bias(bias):
    nn.init.constant_(bias, 0.0)

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Linear') != -1:
        if hasattr(m, 'weight') and m.weight is not None:
            init_weight(m.weight)
        if hasattr(m, 'bias') and m.bias is not None:
            init_bias(m.bias)
    elif classname.find('AdaptiveEmbedding') != -1:
        if hasattr(m, 'emb_projs'):
            for i in range(len(m.emb_projs)):
                if m.emb_projs[i] is not None:
                    nn.init.normal_(m.emb_projs[i], 0.0, args.proj_init_std)
    elif classname.find('Embedding') != -1:
        if hasattr(m, 'weight'):
            init_weight(m.weight)
    elif classname.find('ProjectedAdaptiveLogSoftmax') != -1:
        if hasattr(m, 'cluster_weight') and m.cluster_weight is not None:
            init_weight(m.cluster_weight)
        if hasattr(m, 'cluster_bias') and m.cluster_bias is not None:
            init_bias(m.cluster_bias)
        if hasattr(m, 'out_projs'):
            for i in range(len(m.out_projs)):
                if m.out_projs[i] is not None:
                    nn.init.normal_(m.out_projs[i], 0.0, args.proj_init_std)
    elif classname.find('LayerNorm') != -1:
        if hasattr(m, 'weight'):
            nn.init.normal_(m.weight, 1.0, args.init_std)
        if hasattr(m, 'bias') and m.bias is not None:
            init_bias(m.bias)
    elif classname.find('TransformerLM') != -1:
        if hasattr(m, 'r_emb'):
            init_weight(m.r_emb)
        if hasattr(m, 'r_w_bias'):
            init_weight(m.r_w_bias)
        if hasattr(m, 'r_r_bias'):
            init_weight(m.r_r_bias)
        if hasattr(m, 'r_bias'):
            init_bias(m.r_bias)

def update_dropout(m):
    classname = m.__class__.__name__
    if classname.find('Dropout') != -1:
        if hasattr(m, 'p'):
            m.p = args.dropout

def update_dropatt(m):
    if hasattr(m, 'dropatt'):
        m.dropatt.p = args.dropatt

if args.restart:
    with open(os.path.join(args.restart_dir, 'model.pt'), 'rb') as f:
        model = torch.load(f)
    if not args.fp16:
        model = model.float()
    model.apply(update_dropout)
    model.apply(update_dropatt)
else:
    config = TransfoXLConfig(ntokens, n_layer=args.n_layer, n_head=args.n_head,
                             d_model=args.d_model, d_head=args.d_head, d_inner=args.d_inner,
                             dropout=args.dropout, dropatt=args.dropatt,
                             tie_weight=args.tied, d_embed=args.d_embed, div_val=args.div_val,
                             proj_share_all_but_first=proj_share_all_but_first,
                             pre_lnorm=args.pre_lnorm, tgt_len=args.tgt_len,
                             ext_len=args.ext_len, mem_len=args.mem_len, cutoffs=cutoffs,
                             same_length=args.same_length, attn_type=args.attn_type,
                             clamp_len=args.clamp_len, sample_softmax=args.sample_softmax)
    model = TransfoXLModel(config)
    model.apply(weights_init)
    model.word_emb.apply(weights_init)  # ensure embedding init is not overridden by out_layer in case of weight sharing
args.n_all_param = sum([p.nelement() for p in model.parameters()])
args.n_nonemb_param = sum([p.nelement() for p in model.layers.parameters()])

if args.fp16:
    model = model.half()

if args.multi_gpu:
    model = model.to(device)
    if args.gpu0_bsz >= 0:
        raise NotImplementedError
        # para_model = BalancedDataParallel(args.gpu0_bsz // args.batch_chunk,
        #                                   model, dim=1).to(device)
    else:
        para_model = nn.DataParallel(model, dim=1).to(device)
else:
    para_model = model.to(device)

#### optimizer
if args.optim.lower() == 'sgd':
    if args.sample_softmax > 0:
        dense_params, sparse_params = [], []
        for param in model.parameters():
            if param.size() == model.word_emb.weight.size():
                sparse_params.append(param)
            else:
                dense_params.append(param)
        optimizer_sparse = optim.SGD(sparse_params, lr=args.lr * 2)
        optimizer = optim.SGD(dense_params, lr=args.lr, momentum=args.mom)
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.lr,
                              momentum=args.mom)
elif args.optim.lower() == 'adam':
    if args.sample_softmax > 0:
        dense_params, sparse_params = [], []
        for param in model.parameters():
            if param.size() == model.word_emb.weight.size():
                sparse_params.append(param)
            else:
                dense_params.append(param)
        optimizer_sparse = optim.SparseAdam(sparse_params, lr=args.lr)
        optimizer = optim.Adam(dense_params, lr=args.lr)
    else:
        optimizer = optim.Adam(model.parameters(), lr=args.lr)
elif args.optim.lower() == 'adagrad':
    optimizer = optim.Adagrad(model.parameters(), lr=args.lr)

#### scheduler
if args.scheduler == 'cosine':
    # here we do not set eta_min to lr_min to be backward compatible
    # because in previous versions eta_min is default to 0
    # rather than the default value of lr_min 1e-6
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
        args.max_step, eta_min=args.eta_min)  # should use eta_min arg
    if args.sample_softmax > 0:
        scheduler_sparse = optim.lr_scheduler.CosineAnnealingLR(optimizer_sparse,
            args.max_step, eta_min=args.eta_min)  # should use eta_min arg
elif args.scheduler == 'inv_sqrt':
    # originally used for Transformer (in Attention is all you need)
    def lr_lambda(step):
        # return a multiplier instead of a learning rate
        if step == 0 and args.warmup_step == 0:
            return 1.
        else:
            return 1. / (step ** 0.5) if step > args.warmup_step \
                else step / (args.warmup_step ** 1.5)
    scheduler = optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lr_lambda)
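    # Worked example of the inv_sqrt multiplier above, assuming --warmup_step 1000:
    #   step 500  -> 500 / 1000**1.5   ≈ 0.0158  (linear ramp-up)
    #   step 1000 -> 1000 / 1000**1.5  ≈ 0.0316  (peak)
    #   step 4000 -> 1 / 4000**0.5     ≈ 0.0158  (inverse square-root decay)
    # The multiplier rescales --lr, it is not a learning rate by itself.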
elif args.scheduler == 'dev_perf':
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
        factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min)
    if args.sample_softmax > 0:
        scheduler_sparse = optim.lr_scheduler.ReduceLROnPlateau(optimizer_sparse,
            factor=args.decay_rate, patience=args.patience, min_lr=args.lr_min)
elif args.scheduler == 'constant':
    pass

if args.cuda and args.fp16:
    # If args.dynamic_loss_scale is False, static_loss_scale will be used.
    # If args.dynamic_loss_scale is True, it will take precedence over static_loss_scale.
    optimizer = FP16_Optimizer(optimizer,
                               static_loss_scale = args.static_loss_scale,
                               dynamic_loss_scale = args.dynamic_loss_scale,
                               dynamic_loss_args = {'init_scale': 2 ** 16})

if args.restart:
    if os.path.exists(os.path.join(args.restart_dir, 'optimizer.pt')):
        with open(os.path.join(args.restart_dir, 'optimizer.pt'), 'rb') as f:
            opt_state_dict = torch.load(f)
            optimizer.load_state_dict(opt_state_dict)
    else:
        print('Optimizer was not saved. Start from scratch.')

logger.info('=' * 100)
for k, v in args.__dict__.items():
    logger.info('    - {} : {}'.format(k, v))
logger.info('=' * 100)
logger.info('#params = {}'.format(args.n_all_param))
logger.info('#non emb params = {}'.format(args.n_nonemb_param))

###############################################################################
# Training code
###############################################################################

def evaluate(eval_iter):
    # Turn on evaluation mode which disables dropout.
    model.eval()

    # If the model does not use memory at all, make the ext_len longer.
    # Otherwise, make the mem_len longer and keep the ext_len the same.
    if args.mem_len == 0:
        model.reset_length(args.eval_tgt_len,
                           args.ext_len+args.tgt_len-args.eval_tgt_len, args.mem_len)
    else:
        model.reset_length(args.eval_tgt_len,
                           args.ext_len, args.mem_len+args.tgt_len-args.eval_tgt_len)
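
    # Numeric sketch with the parser defaults (--tgt_len 70, --eval_tgt_len 50,
    # --ext_len 0, --mem_len 0): the first branch is taken and ext_len becomes
    # 0 + 70 - 50 = 20, so each evaluation step still attends over 70 positions
    # (50 predicted tokens plus 20 tokens of extra context).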

    # Evaluation
    total_len, total_loss = 0, 0.
    with torch.no_grad():
        mems = tuple()
        for i, (data, target, seq_len) in enumerate(eval_iter):
            if args.max_eval_steps > 0 and i >= args.max_eval_steps:
                break
            ret = model(data, target, *mems)
            loss, mems = ret[0], ret[1:]
            loss = loss.mean()
            total_loss += seq_len * loss.float().item()
            total_len += seq_len

    # Switch back to the training mode
    model.reset_length(args.tgt_len, args.ext_len, args.mem_len)
    model.train()

    return total_loss / total_len


def train():
    # Turn on training mode which enables dropout.
    global train_step, train_loss, best_val_loss, eval_start_time, log_start_time
    model.train()
    if args.batch_chunk > 1:
        mems = [tuple() for _ in range(args.batch_chunk)]
    else:
        mems = tuple()
    train_iter = tr_iter.get_varlen_iter() if args.varlen else tr_iter
    for batch, (data, target, seq_len) in enumerate(train_iter):
        model.zero_grad()
        if args.batch_chunk > 1:
            data_chunks = torch.chunk(data, args.batch_chunk, 1)
            target_chunks = torch.chunk(target, args.batch_chunk, 1)
            for i in range(args.batch_chunk):
                data_i = data_chunks[i].contiguous()
                target_i = target_chunks[i].contiguous()
                ret = para_model(data_i, target_i, *mems[i])
                loss, mems[i] = ret[0], ret[1:]
                loss = loss.float().mean().type_as(loss) / args.batch_chunk
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                train_loss += loss.float().item()
        else:
            ret = para_model(data, target, *mems)
            loss, mems = ret[0], ret[1:]
            loss = loss.float().mean().type_as(loss)
            if args.fp16:
                optimizer.backward(loss)
            else:
                loss.backward()
            train_loss += loss.float().item()

        if args.fp16:
            optimizer.clip_master_grads(args.clip)
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.clip)

        optimizer.step()
        if args.sample_softmax > 0:
            optimizer_sparse.step()

        # step-wise learning rate annealing
        train_step += 1
        if args.scheduler in ['cosine', 'constant', 'dev_perf']:
            # linear warmup stage
            if train_step < args.warmup_step:
                curr_lr = args.lr * train_step / args.warmup_step
                optimizer.param_groups[0]['lr'] = curr_lr
                if args.sample_softmax > 0:
                    optimizer_sparse.param_groups[0]['lr'] = curr_lr * 2
            else:
                if args.scheduler == 'cosine':
                    scheduler.step(train_step)
                    if args.sample_softmax > 0:
                        scheduler_sparse.step(train_step)
        elif args.scheduler == 'inv_sqrt':
            scheduler.step(train_step)

        if train_step % args.log_interval == 0:
            cur_loss = train_loss / args.log_interval
            elapsed = time.time() - log_start_time
            log_str = '| epoch {:3d} step {:>8d} | {:>6d} batches | lr {:.3g} ' \
                      '| ms/batch {:5.2f} | loss {:5.2f}'.format(
                epoch, train_step, batch+1, optimizer.param_groups[0]['lr'],
                elapsed * 1000 / args.log_interval, cur_loss)
            if args.dataset in ['enwik8', 'text8']:
                log_str += ' | bpc {:9.5f}'.format(cur_loss / math.log(2))
            else:
                log_str += ' | ppl {:9.3f}'.format(math.exp(cur_loss))
            logger.info(log_str)
            train_loss = 0
            log_start_time = time.time()

        if train_step % args.eval_interval == 0:
            val_loss = evaluate(va_iter)
            logger.info('-' * 100)
            log_str = '| Eval {:3d} at step {:>8d} | time: {:5.2f}s ' \
                      '| valid loss {:5.2f}'.format(
                train_step // args.eval_interval, train_step,
                (time.time() - eval_start_time), val_loss)
            if args.dataset in ['enwik8', 'text8']:
                log_str += ' | bpc {:9.5f}'.format(val_loss / math.log(2))
            else:
                log_str += ' | valid ppl {:9.3f}'.format(math.exp(val_loss))
            logger.info(log_str)
            logger.info('-' * 100)
            # Save the model if the validation loss is the best we've seen so far.
            if not best_val_loss or val_loss < best_val_loss:
                if not args.debug:
                    with open(os.path.join(args.work_dir, 'model.pt'), 'wb') as f:
                        torch.save(model, f)
                    with open(os.path.join(args.work_dir, 'optimizer.pt'), 'wb') as f:
                        torch.save(optimizer.state_dict(), f)
                best_val_loss = val_loss

            # dev-performance based learning rate annealing
            if args.scheduler == 'dev_perf':
                scheduler.step(val_loss)
                if args.sample_softmax > 0:
                    scheduler_sparse.step(val_loss)

            eval_start_time = time.time()

        if train_step == args.max_step:
            break

# Loop over epochs.
train_step = 0
train_loss = 0
best_val_loss = None

log_start_time = time.time()
eval_start_time = time.time()

# At any point you can hit Ctrl + C to break out of training early.
try:
    for epoch in itertools.count(start=1):
        train()
        if train_step == args.max_step:
            logger.info('-' * 100)
            logger.info('End of training')
            break
except KeyboardInterrupt:
    logger.info('-' * 100)
    logger.info('Exiting from training early')

# Load the best saved model.
with open(os.path.join(args.work_dir, 'model.pt'), 'rb') as f:
    model = torch.load(f)
para_model = model.to(device)

# Run on test data.
test_loss = evaluate(te_iter)
logger.info('=' * 100)
if args.dataset in ['enwik8', 'text8']:
    logger.info('| End of training | test loss {:5.2f} | test bpc {:9.5f}'.format(
        test_loss, test_loss / math.log(2)))
else:
    logger.info('| End of training | test loss {:5.2f} | test ppl {:9.3f}'.format(
        test_loss, math.exp(test_loss)))
logger.info('=' * 100)