#!/usr/bin/env python
# coding=utf-8
# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" OpenAI GPT model fine-tuning script.
|
|
Adapted from https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/train.py
|
|
It self adapted from https://github.com/openai/finetune-transformer-lm/blob/master/train.py
|
|
|
|
This script with default values fine-tunes and evaluate a pretrained OpenAI GPT on the RocStories dataset:
|
|
python run_openai_gpt.py \
|
|
--model_name openai-gpt \
|
|
--do_train \
|
|
--do_eval \
|
|
--train_dataset "$ROC_STORIES_DIR/cloze_test_val__spring2016 - cloze_test_ALL_val.csv" \
|
|
--eval_dataset "$ROC_STORIES_DIR/cloze_test_test__spring2016 - cloze_test_ALL_test.csv" \
|
|
--output_dir ../log \
|
|
--train_batch_size 16 \
|
|
"""
|
|
import argparse
import csv
import logging
import os
import random

import numpy as np
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm, trange

from transformers import (
    CONFIG_NAME,
    WEIGHTS_NAME,
    AdamW,
    OpenAIGPTDoubleHeadsModel,
    OpenAIGPTTokenizer,
    get_linear_schedule_with_warmup,
)


logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
)
logger = logging.getLogger(__name__)


def accuracy(out, labels):
    # Returns the number of correct predictions; the caller divides by the example count.
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


def load_rocstories_dataset(dataset_path):
    """Output a list of tuples (story, 1st continuation, 2nd continuation, label)"""
    with open(dataset_path, encoding="utf_8") as f:
        f = csv.reader(f)
        output = []
        next(f)  # skip the first line
        for line in tqdm(f):
            output.append((" ".join(line[1:5]), line[5], line[6], int(line[-1]) - 1))

    return output


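# Once tokenized and encoded, each of these tuples is what pre_process_datasets below receives.
# An illustrative, hypothetical example of one raw tuple:
#   ("Four story sentences joined into one string ...", "first ending", "second ending", 0)
# The final element is the 0-indexed correct-ending label (the CSV's answer column is 1-indexed,
# hence the `- 1` above).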
def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, delimiter_token, clf_token):
    """Pre-process datasets containing lists of tuples (story, 1st continuation, 2nd continuation, label)

    into Transformer inputs of shape (n_batch, n_alternative, length), comprising for each batch and continuation:
    input_ids[batch, alternative, :] = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
    """
    tensor_datasets = []
    for dataset in encoded_datasets:
        n_batch = len(dataset)
        input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
        mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
        # -100 is the default ignore_index of PyTorch's cross-entropy loss, so padded positions
        # do not contribute to the language-modeling loss.
        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
        mc_labels = np.zeros((n_batch,), dtype=np.int64)
        for i, (story, cont1, cont2, mc_label) in enumerate(dataset):
            with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]
            with_cont2 = [start_token] + story[:cap_length] + [delimiter_token] + cont2[:cap_length] + [clf_token]
            input_ids[i, 0, : len(with_cont1)] = with_cont1
            input_ids[i, 1, : len(with_cont2)] = with_cont2
            mc_token_ids[i, 0] = len(with_cont1) - 1
            mc_token_ids[i, 1] = len(with_cont2) - 1
            lm_labels[i, 0, : len(with_cont1)] = with_cont1
            lm_labels[i, 1, : len(with_cont2)] = with_cont2
            mc_labels[i] = mc_label
        all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
        tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
    return tensor_datasets


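# Note on the tensors produced above and consumed in main() below (schematic, hypothetical ids): for
# example i and alternative 0,
#   input_ids[i, 0, :]  is [start_token] + story + [delimiter_token] + cont1 + [clf_token] + zero padding,
#   lm_labels[i, 0, :]  mirrors the same tokens and stays at -100 over the padding,
#   mc_token_ids[i, 0]  is the position of the [clf_token], whose hidden state feeds the multiple-choice head.
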
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_name", type=str, default="openai-gpt", help="pretrained model name")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--train_dataset", type=str, default="")
    parser.add_argument("--eval_dataset", type=str, default="")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--num_train_epochs", type=int, default=3)
    parser.add_argument("--train_batch_size", type=int, default=8)
    parser.add_argument("--eval_batch_size", type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", type=int, default=1)
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.",
    )
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of update steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--learning_rate", type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument("--lr_schedule", type=str, default="warmup_linear")
    parser.add_argument("--weight_decay", type=float, default=0.01)
    parser.add_argument("--lm_coef", type=float, default=0.9)
    parser.add_argument("--n_valid", type=int, default=374)

    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # These loading functions also add new tokens and embeddings, called `special tokens`.
    # These new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ["_start_", "_delimiter_", "_classify_"]
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    # Resize the token embedding matrix so the three added special tokens get trainable embeddings.
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

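    # Note: `special_tokens_ids` holds the newly assigned vocabulary ids of "_start_", "_delimiter_"
    # and "_classify_", in that order; pre_process_datasets later unpacks them positionally as
    # (start_token, delimiter_token, clf_token).
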
    # Load and encode the datasets
    def tokenize_and_encode(obj):
        """Tokenize and encode a nested object"""
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return [tokenize_and_encode(o) for o in obj]

logger.info("Encoding dataset...")
|
|
train_dataset = load_rocstories_dataset(args.train_dataset)
|
|
eval_dataset = load_rocstories_dataset(args.eval_dataset)
|
|
datasets = (train_dataset, eval_dataset)
|
|
encoded_datasets = tokenize_and_encode(datasets)
|
|
|
|
# Compute the max input length for the Transformer
|
|
max_length = model.config.n_positions // 2 - 2
|
|
input_length = max(
|
|
len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
|
|
for dataset in encoded_datasets
|
|
for story, cont1, cont2, _ in dataset
|
|
)
|
|
input_length = min(input_length, model.config.n_positions) # Max size of input for the pre-trained model
|
|
|
|
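    # Worked example of the arithmetic above, assuming the stock `openai-gpt` config (n_positions = 512):
    # max_length = 512 // 2 - 2 = 254, so a capped story plus its longer capped continuation plus the
    # three special tokens never exceeds 254 + 254 + 3 = 511 <= 512 positions.
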
    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        # Biases and LayerNorm parameters are excluded from weight decay.
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                "weight_decay": args.weight_decay,
            },
            {"params": [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
        )

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels)
                # losses[0] is the language-modeling loss and losses[1] the multiple-choice loss;
                # lm_coef weights the auxiliary language-modeling objective.
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                # Exponential moving average of the loss, used only for the progress-bar display.
                exp_average_loss = (
                    loss.item() if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item()
                )
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, "module") else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(
                    input_ids, mc_token_ids=mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels
                )

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to("cpu").numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {"eval_loss": eval_loss, "eval_accuracy": eval_accuracy, "train_loss": train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))


if __name__ == "__main__":
    main()