From 22e7c4edaf007d92912df54c336d10078ac7d565 Mon Sep 17 00:00:00 2001 From: erenup Date: Thu, 3 Oct 2019 18:33:53 +0800 Subject: [PATCH 001/302] fixing for roberta tokenizer decoding --- examples/run_squad.py | 4 ++-- examples/utils_squad.py | 37 ++++++++++++++++++++++--------------- 2 files changed, 24 insertions(+), 17 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 0c0fbf2963..8a9f123d20 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -263,7 +263,7 @@ def evaluate(args, model, tokenizer, prefix=""): write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, args.model_type) # Evaluate with the official SQuAD script evaluate_options = EVAL_OPTS(data_file=args.predict_file, @@ -296,7 +296,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, - is_training=not evaluate) + is_training=not evaluate, add_prefix_space=True if args.model_type == 'roberta' else False) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/examples/utils_squad.py b/examples/utils_squad.py index b990ecc842..82a4b96b79 100644 --- a/examples/utils_squad.py +++ b/examples/utils_squad.py @@ -25,6 +25,7 @@ import collections from io import open from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize +from transformers.tokenization_roberta import RobertaTokenizer # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores @@ -192,7 +193,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, cls_token='[CLS]', sep_token='[SEP]', pad_token=0, sequence_a_segment_id=0, sequence_b_segment_id=1, cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True): + mask_padding_with_zero=True, add_prefix_space=False): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -205,8 +206,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # if example_index % 100 == 0: # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) - - query_tokens = tokenizer.tokenize(example.question_text) + query_tokens = tokenizer.tokenize(example.question_text, add_prefix_space=add_prefix_space) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] @@ -216,7 +216,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) + sub_tokens = tokenizer.tokenize(token, add_prefix_space=add_prefix_space) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) @@ -234,7 +234,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, 
tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) + example.orig_answer_text, add_prefix_space) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 @@ -398,7 +398,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): + orig_answer_text, add_prefix_space): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. We first project them to @@ -423,7 +423,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, # the word "Japanese". Since our WordPiece tokenizer does not split # "Japanese", we just use "Japanese" as the annotation. This is fairly rare # in SQuAD, but does happen. - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text, add_prefix_space=add_prefix_space)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): @@ -477,7 +477,7 @@ RawResult = collections.namedtuple("RawResult", def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): + version_2_with_negative, null_score_diff_threshold, tokenizer, mode_type='bert'): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -576,15 +576,22 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") + if mode_type == 'roberta': + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + tok_text = tok_text.replace("##", "") + tok_text = " ".join(tok_text.strip().split()) + orig_text = " ".join(orig_tokens) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, None) + else: + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue From b5d73976ad7e701e912664deacc4d44d0adefd05 Mon Sep 17 00:00:00 2001 From: erenup Date: Thu, 3 Oct 2019 20:48:17 +0800 Subject: [PATCH 002/302] Revert "fixing for roberta tokenizer decoding" This reverts commit 22e7c4edaf007d92912df54c336d10078ac7d565. 
--- examples/run_squad.py | 4 ++-- examples/utils_squad.py | 37 +++++++++++++++---------------------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 8a9f123d20..0c0fbf2963 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -263,7 +263,7 @@ def evaluate(args, model, tokenizer, prefix=""): write_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold, tokenizer, args.model_type) + args.version_2_with_negative, args.null_score_diff_threshold) # Evaluate with the official SQuAD script evaluate_options = EVAL_OPTS(data_file=args.predict_file, @@ -296,7 +296,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, - is_training=not evaluate, add_prefix_space=True if args.model_type == 'roberta' else False) + is_training=not evaluate) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/examples/utils_squad.py b/examples/utils_squad.py index 82a4b96b79..b990ecc842 100644 --- a/examples/utils_squad.py +++ b/examples/utils_squad.py @@ -25,7 +25,6 @@ import collections from io import open from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize -from transformers.tokenization_roberta import RobertaTokenizer # Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores @@ -193,7 +192,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, cls_token='[CLS]', sep_token='[SEP]', pad_token=0, sequence_a_segment_id=0, sequence_b_segment_id=1, cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True, add_prefix_space=False): + mask_padding_with_zero=True): """Loads a data file into a list of `InputBatch`s.""" unique_id = 1000000000 @@ -206,7 +205,8 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, # if example_index % 100 == 0: # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) - query_tokens = tokenizer.tokenize(example.question_text, add_prefix_space=add_prefix_space) + + query_tokens = tokenizer.tokenize(example.question_text) if len(query_tokens) > max_query_length: query_tokens = query_tokens[0:max_query_length] @@ -216,7 +216,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, all_doc_tokens = [] for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token, add_prefix_space=add_prefix_space) + sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) @@ -234,7 +234,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, tok_end_position = len(all_doc_tokens) - 1 (tok_start_position, tok_end_position) = _improve_answer_span( all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text, add_prefix_space) + example.orig_answer_text) # The -3 accounts for [CLS], [SEP] and [SEP] max_tokens_for_doc = 
max_seq_length - len(query_tokens) - 3 @@ -398,7 +398,7 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length, def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text, add_prefix_space): + orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" # The SQuAD annotations are character based. We first project them to @@ -423,7 +423,7 @@ def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, # the word "Japanese". Since our WordPiece tokenizer does not split # "Japanese", we just use "Japanese" as the annotation. This is fairly rare # in SQuAD, but does happen. - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text, add_prefix_space=add_prefix_space)) + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): @@ -477,7 +477,7 @@ RawResult = collections.namedtuple("RawResult", def write_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold, tokenizer, mode_type='bert'): + version_2_with_negative, null_score_diff_threshold): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -576,22 +576,15 @@ def write_predictions(all_examples, all_features, all_results, n_best_size, tok_text = " ".join(tok_tokens) # De-tokenize WordPieces that have been split off. - if mode_type == 'roberta': - tok_text = tokenizer.convert_tokens_to_string(tok_tokens) - tok_text = tok_text.replace("##", "") - tok_text = " ".join(tok_text.strip().split()) - orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging, None) - else: - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: continue From 3a52b65795f7a81f6f0ae48d96e235388fc86b87 Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Mon, 21 Oct 2019 12:55:51 +0800 Subject: [PATCH 003/302] Add special tokens to documentation for bert examples to resolve issue: #1561 --- transformers/modeling_bert.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 8c92241fa2..9a9cd31b4b 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -557,7 +557,7 @@ class BertModel(BertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertModel.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", 
add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -667,7 +667,7 @@ class BertForPreTraining(BertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForPreTraining.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] @@ -739,7 +739,7 @@ class BertForMaskedLM(BertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMaskedLM.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] @@ -808,7 +808,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) seq_relationship_scores = outputs[0] @@ -871,7 +871,7 @@ class BertForSequenceClassification(BertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForSequenceClassification.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] @@ -945,7 +945,7 @@ class BertForMultipleChoice(BertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForMultipleChoice.from_pretrained('bert-base-uncased') choices = ["Hello, my dog is cute", "Hello, my cat is amazing"] - input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices labels = torch.tensor(1).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, classification_scores = outputs[:2] @@ -1017,7 +1017,7 @@ class BertForTokenClassification(BertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = BertForTokenClassification.from_pretrained('bert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, scores = outputs[:2] From 6e011690a980c3c3f69fdfc3af8705859250cc6b Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: 
Sun, 27 Oct 2019 13:59:14 +0800 Subject: [PATCH 004/302] Add special tokens to documentation for the rest of pytorch model examples #1561 --- transformers/modeling_ctrl.py | 4 ++-- transformers/modeling_distilbert.py | 8 ++++---- transformers/modeling_gpt2.py | 4 ++-- transformers/modeling_openai.py | 4 ++-- transformers/modeling_roberta.py | 6 +++--- transformers/modeling_transfo_xl.py | 4 ++-- transformers/modeling_xlm.py | 10 +++++----- transformers/modeling_xlnet.py | 10 +++++----- 8 files changed, 25 insertions(+), 25 deletions(-) diff --git a/transformers/modeling_ctrl.py b/transformers/modeling_ctrl.py index 55e64d318b..b31755e13d 100644 --- a/transformers/modeling_ctrl.py +++ b/transformers/modeling_ctrl.py @@ -261,7 +261,7 @@ class CTRLModel(CTRLPreTrainedModel): tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = CTRLModel.from_pretrained('ctrl') - input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -438,7 +438,7 @@ class CTRLLMHeadModel(CTRLPreTrainedModel): tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = CTRLLMHeadModel.from_pretrained('ctrl') - input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py index d3b4ccff5d..990d76e378 100644 --- a/transformers/modeling_distilbert.py +++ b/transformers/modeling_distilbert.py @@ -411,7 +411,7 @@ class DistilBertModel(DistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = DistilBertModel.from_pretrained('distilbert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -495,7 +495,7 @@ class DistilBertForMaskedLM(DistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = DistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] @@ -569,7 +569,7 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = 
model(input_ids, labels=labels) loss, logits = outputs[:2] @@ -643,7 +643,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = DistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 0b5b83aa75..87878abb4e 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -338,7 +338,7 @@ class GPT2Model(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2Model.from_pretrained('gpt2') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -503,7 +503,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = GPT2LMHeadModel.from_pretrained('gpt2') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index 52f3b7db72..c6b13dee4e 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -343,7 +343,7 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel): tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTModel.from_pretrained('openai-gpt') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -478,7 +478,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel): tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index eb340dc7fb..cbd2e0106d 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -154,7 +154,7 @@ class RobertaModel(BertModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaModel.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", 
add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -209,7 +209,7 @@ class RobertaForMaskedLM(BertPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForMaskedLM.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) loss, prediction_scores = outputs[:2] @@ -303,7 +303,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = RobertaForSequenceClassification.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index 6d430e1804..ad1c7bdea4 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -578,7 +578,7 @@ class TransfoXLModel(TransfoXLPreTrainedModel): tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TransfoXLModel.from_pretrained('transfo-xl-wt103') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states, mems = outputs[:2] @@ -808,7 +808,7 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) prediction_scores, mems = outputs[:2] diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index b29e721556..a7c8f4e941 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -332,7 +332,7 @@ class XLMModel(XLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMModel.from_pretrained('xlm-mlm-en-2048') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -607,7 +607,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) 
last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -671,7 +671,7 @@ class XLMForSequenceClassification(XLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] @@ -754,7 +754,7 @@ class XLMForQuestionAnsweringSimple(XLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) @@ -849,7 +849,7 @@ class XLMForQuestionAnswering(XLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index e191ebadd0..fab405fd2b 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -584,7 +584,7 @@ class XLNetModel(XLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetModel.from_pretrained('xlnet-large-cased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -900,7 +900,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased') # We show how to setup inputs to predict a next token using a bi-directional context. 
- input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ")).unsqueeze(0) # We will predict the masked token + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very ", add_special_tokens=True)).unsqueeze(0) # We will predict the masked token perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float) # Shape [1, 1, seq_length] => let's predict one token @@ -983,7 +983,7 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=labels) loss, logits = outputs[:2] @@ -1163,7 +1163,7 @@ class XLNetForQuestionAnsweringSimple(XLNetPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) @@ -1276,7 +1276,7 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 start_positions = torch.tensor([1]) end_positions = torch.tensor([3]) outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) From ec276d6abad7eae800f1a1a039ddc78fde406009 Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Sun, 27 Oct 2019 14:00:40 +0800 Subject: [PATCH 005/302] Add special tokens to documentation for the tensorflow model examples #1561 --- transformers/modeling_tf_bert.py | 14 +++++++------- transformers/modeling_tf_ctrl.py | 4 ++-- transformers/modeling_tf_distilbert.py | 8 ++++---- transformers/modeling_tf_gpt2.py | 4 ++-- transformers/modeling_tf_openai.py | 4 ++-- transformers/modeling_tf_roberta.py | 6 +++--- transformers/modeling_tf_transfo_xl.py | 4 ++-- transformers/modeling_tf_xlm.py | 8 ++++---- transformers/modeling_tf_xlnet.py | 10 +++++----- 9 files changed, 31 insertions(+), 31 deletions(-) diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index afe9b2946b..d2d3c7be37 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -647,7 +647,7 @@ class TFBertModel(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertModel.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", 
add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -686,7 +686,7 @@ class TFBertForPreTraining(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForPreTraining.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, seq_relationship_scores = outputs[:2] @@ -732,7 +732,7 @@ class TFBertForMaskedLM(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForMaskedLM.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] @@ -776,7 +776,7 @@ class TFBertForNextSentencePrediction(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForNextSentencePrediction.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) seq_relationship_scores = outputs[0] @@ -821,7 +821,7 @@ class TFBertForSequenceClassification(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] @@ -952,7 +952,7 @@ class TFBertForTokenClassification(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForTokenClassification.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) scores = outputs[0] @@ -1005,7 +1005,7 @@ class TFBertForQuestionAnswering(TFBertPreTrainedModel): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') model = TFBertForQuestionAnswering.from_pretrained('bert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] diff --git a/transformers/modeling_tf_ctrl.py b/transformers/modeling_tf_ctrl.py index c8d181548b..66766a066e 100644 --- a/transformers/modeling_tf_ctrl.py +++ b/transformers/modeling_tf_ctrl.py @@ -402,7 +402,7 @@ class TFCTRLModel(TFCTRLPreTrainedModel): tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = TFCTRLModel.from_pretrained('ctrl') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = 
tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -465,7 +465,7 @@ class TFCTRLLMHeadModel(TFCTRLPreTrainedModel): tokenizer = CTRLTokenizer.from_pretrained('ctrl') model = TFCTRLLMHeadModel.from_pretrained('ctrl') - input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + input_ids = torch.tensor(tokenizer.encode("Links Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) # Batch size 1 outputs = model(input_ids, labels=input_ids) loss, logits = outputs[:2] diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index 188394816e..c2d0f73999 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -532,7 +532,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertModel.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -590,7 +590,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] @@ -645,7 +645,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] @@ -702,7 +702,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index 4188b273ba..24f2857b80 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -436,7 +436,7 @@ class TFGPT2Model(TFGPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2Model.from_pretrained('gpt2') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", 
add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -477,7 +477,7 @@ class TFGPT2LMHeadModel(TFGPT2PreTrainedModel): tokenizer = GPT2Tokenizer.from_pretrained('gpt2') model = TFGPT2LMHeadModel.from_pretrained('gpt2') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index 747c5171fd..08034b2d2e 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -413,7 +413,7 @@ class TFOpenAIGPTModel(TFOpenAIGPTPreTrainedModel): tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTModel.from_pretrained('openai-gpt') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -449,7 +449,7 @@ class TFOpenAIGPTLMHeadModel(TFOpenAIGPTPreTrainedModel): tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt') model = TFOpenAIGPTLMHeadModel.from_pretrained('openai-gpt') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py index 244c83f2b3..dcf00c3add 100644 --- a/transformers/modeling_tf_roberta.py +++ b/transformers/modeling_tf_roberta.py @@ -204,7 +204,7 @@ class TFRobertaModel(TFRobertaPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaModel.from_pretrained('roberta-base') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -281,7 +281,7 @@ class TFRobertaForMaskedLM(TFRobertaPreTrainedModel): tokenizer = RobertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForMaskedLM.from_pretrained('roberta-base') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids, masked_lm_labels=input_ids) prediction_scores = outputs[0] @@ -349,7 +349,7 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel): tokenizer = RoertaTokenizer.from_pretrained('roberta-base') model = TFRobertaForSequenceClassification.from_pretrained('roberta-base') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.constant([1])[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] diff --git 
a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index a3e403ce06..87863163f0 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -654,7 +654,7 @@ class TFTransfoXLModel(TFTransfoXLPreTrainedModel): tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TFTransfoXLModel.from_pretrained('transfo-xl-wt103') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states, mems = outputs[:2] @@ -696,7 +696,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103') model = TFTransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores, mems = outputs[:2] diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index 84de1517ee..7b7305f22e 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -550,7 +550,7 @@ class TFXLMModel(TFXLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMModel.from_pretrained('xlm-mlm-en-2048') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -623,7 +623,7 @@ class TFXLMWithLMHeadModel(TFXLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -667,7 +667,7 @@ class TFXLMForSequenceClassification(TFXLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 labels = tf.constant([1])[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] @@ -715,7 +715,7 @@ class TFXLMForQuestionAnsweringSimple(TFXLMPreTrainedModel): tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') model = TFXLMForQuestionAnsweringSimple.from_pretrained('xlm-mlm-en-2048') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 8a25be78c1..d2029db485 100644 --- 
a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -791,7 +791,7 @@ class TFXLNetModel(TFXLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetModel.from_pretrained('xlnet-large-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -835,7 +835,7 @@ class TFXLNetLMHeadModel(TFXLNetPreTrainedModel): model = TFXLNetLMHeadModel.from_pretrained('xlnet-large-cased') # We show how to setup inputs to predict a next token using a bi-directional context. - input_ids = tf.constant(tokenizer.encode("Hello, my dog is very "))[None, :] # We will predict the masked token + input_ids = tf.constant(tokenizer.encode("Hello, my dog is very ", add_special_tokens=True))[None, :] # We will predict the masked token perm_mask = tf.zeros((1, input_ids.shape[1], input_ids.shape[1])) perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token target_mapping = tf.zeros((1, 1, input_ids.shape[1])) # Shape [1, 1, seq_length] => let's predict one token @@ -888,7 +888,7 @@ class TFXLNetForSequenceClassification(TFXLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased') model = TFXLNetForSequenceClassification.from_pretrained('xlnet-large-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] @@ -946,7 +946,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') model = TFXLNetForQuestionAnsweringSimple.from_pretrained('xlnet-base-cased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] @@ -1010,7 +1010,7 @@ class TFXLNetForQuestionAnsweringSimple(TFXLNetPreTrainedModel): # tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048') # model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased') -# input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 +# input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 # start_positions = tf.constant([1]) # end_positions = tf.constant([3]) # outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) From d36680df546a9d4a20c58c0eab2b14ee054436ca Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Sun, 27 Oct 2019 14:51:36 +0800 Subject: [PATCH 006/302] Rever changes to TF distilbert due to failed test: TFDistilBertModelTest.test_pt_tf_model_equivalence --- transformers/modeling_tf_distilbert.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py index c2d0f73999..188394816e 100644 --- a/transformers/modeling_tf_distilbert.py +++ b/transformers/modeling_tf_distilbert.py @@ -532,7 +532,7 @@ class TFDistilBertModel(TFDistilBertPreTrainedModel): tokenizer 
= DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertModel.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -590,7 +590,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] @@ -645,7 +645,7 @@ class TFDistilBertForSequenceClassification(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) logits = outputs[0] @@ -702,7 +702,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel): tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased') model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] # Batch size 1 + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) start_scores, end_scores = outputs[:2] From dfb61caf77a02a735af0bf430f6d6082b7d01cfd Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 5 Nov 2019 11:25:13 +0100 Subject: [PATCH 007/302] fix #1692 --- transformers/modeling_tf_xlnet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 8a25be78c1..e83674e214 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -539,7 +539,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): assert input_mask is None or attention_mask is None, "You can only use one of input_mask (uses 1 for padding) " \ "or attention_mask (uses 0 for padding, added for compatbility with BERT). Please choose one." 
if input_mask is None and attention_mask is not None: - input_mask = 1.0 - attention_mask + input_mask = 1.0 - tf.cast(attention_mask, dtype=dtype_float) if input_mask is not None and perm_mask is not None: data_mask = input_mask[None] + perm_mask elif input_mask is not None and perm_mask is None: From 60a5babd57dd80f855df859abf006ee4488ff639 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 5 Nov 2019 12:01:23 +0100 Subject: [PATCH 008/302] adding files --- transformers/configuration_t5.py | 130 +++++ ...rt_t5_original_tf_checkpoint_to_pytorch.py | 65 +++ transformers/modeling_t5.py | 373 +++++++++++++ transformers/modeling_tf_t5.py | 496 ++++++++++++++++++ transformers/tokenization_t5.py | 214 ++++++++ 5 files changed, 1278 insertions(+) create mode 100644 transformers/configuration_t5.py create mode 100755 transformers/convert_t5_original_tf_checkpoint_to_pytorch.py create mode 100644 transformers/modeling_t5.py create mode 100644 transformers/modeling_tf_t5.py create mode 100644 transformers/tokenization_t5.py diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py new file mode 100644 index 0000000000..a37a5b2157 --- /dev/null +++ b/transformers/configuration_t5.py @@ -0,0 +1,130 @@ +# coding=utf-8 +# Copyright 2010, The T5 Authors and HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" T5 model configuration """ + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import sys +import six +from io import open + +from .configuration_utils import PretrainedConfig + +logger = logging.getLogger(__name__) + +T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-config.json", + 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-config.json", +} + + +class T5Config(PretrainedConfig): + r""" + :class:`~transformers.T5Config` is the configuration class to store the configuration of a + `T5Model`. + + + Arguments: + vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `T5Model`. + hidden_size: Size of the encoder layers and the pooler layer. + num_hidden_layers: Number of hidden layers in the Transformer encoder. + num_attention_heads: Number of attention heads for each attention layer in + the Transformer encoder. + intermediate_size: The size of the "intermediate" (i.e., feed-forward) + layer in the Transformer encoder. + hidden_act: The non-linear activation function (function or string) in the + encoder and pooler. If string, "gelu", "relu", "swish" and "gelu_new" are supported. + hidden_dropout_prob: The dropout probabilitiy for all fully connected + layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob: The dropout ratio for the attention + probabilities. + max_position_embeddings: The maximum sequence length that this model might + ever be used with. Typically set this to something large just in case + (e.g., 512 or 1024 or 2048). 
+ type_vocab_size: The vocabulary size of the `token_type_ids` passed into + `T5Model`. + initializer_range: The sttdev of the truncated_normal_initializer for + initializing all weight matrices. + layer_norm_eps: The epsilon used by LayerNorm. + """ + pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP + + def __init__(self, + vocab_size_or_config_json_file=50257, + n_positions=1024, + n_ctx=1024, + n_embd=768, + n_layer=12, + n_head=12, + resid_pdrop=0.1, + embd_pdrop=0.1, + attn_pdrop=0.1, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + + num_labels=1, + summary_type='cls_index', + summary_use_proj=True, + summary_activation=None, + summary_proj_to_labels=True, + summary_first_dropout=0.1, + **kwargs): + super(T5Config, self).__init__(**kwargs) + self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1 + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + + self.num_labels = num_labels + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels + if isinstance(vocab_size_or_config_json_file, six.string_types): + with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + json_config = json.loads(reader.read()) + for key, value in json_config.items(): + self.__dict__[key] = value + elif not isinstance(vocab_size_or_config_json_file, int): + raise ValueError( + "First argument must be either a vocabulary size (int)" + "or the path to a pretrained model config file (str)" + ) + + @property + def max_position_embeddings(self): + return self.n_positions + + @property + def hidden_size(self): + return self.n_embd + + @property + def num_attention_heads(self): + return self.n_head + + @property + def num_hidden_layers(self): + return self.n_layer diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py new file mode 100755 index 0000000000..608027ebac --- /dev/null +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2018 The T5 authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert T5 checkpoint.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import argparse +import torch + +from transformers import T5Config, T5ForPreTraining, load_tf_weights_in_t5 + +import logging +logging.basicConfig(level=logging.INFO) + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, t5_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = T5Config.from_json_file(t5_config_file) + print("Building PyTorch model from configuration: {}".format(str(config))) + model = T5ForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_t5(model, config, tf_checkpoint_path) + + # Save pytorch-model + print("Save PyTorch model to {}".format(pytorch_dump_path)) + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--tf_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path to the TensorFlow checkpoint path.") + parser.add_argument("--t5_config_file", + default = None, + type = str, + required = True, + help = "The config json file corresponding to the pre-trained T5 model. \n" + "This specifies the model architecture.") + parser.add_argument("--pytorch_dump_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, + args.t5_config_file, + args.pytorch_dump_path) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py new file mode 100644 index 0000000000..fa3c22f24b --- /dev/null +++ b/transformers/modeling_t5.py @@ -0,0 +1,373 @@ +# coding=utf-8 +# Copyright 2018 T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" PyTorch T5 model. 
""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import torch +from torch import nn +from torch.nn import CrossEntropyLoss, MSELoss + +from .modeling_utils import PreTrainedModel, prune_linear_layer +from .configuration_t5 import T5Config +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +#################################################### +# This dict contrains shortcut names and associated url +# for the pretrained weights provided with the models +#################################################### +T5_PRETRAINED_MODEL_ARCHIVE_MAP = { + 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-pytorch_model.bin", + 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-pytorch_model.bin", +} + +#################################################### +# This is a conversion method from TF 1.0 to PyTorch +# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28 +#################################################### +def load_tf_weights_in_t5(model, config, tf_checkpoint_path): + """ Load tf checkpoints in a pytorch model. + """ + try: + import re + import numpy as np + import tensorflow as tf + except ImportError: + logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info("Converting TensorFlow checkpoint from {}".format(tf_path)) + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info("Loading TF weight {} with shape {}".format(name, shape)) + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split('/') + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any(n in ["adam_v", "adam_m", "global_step"] for n in name): + logger.info("Skipping {}".format("/".join(name))) + continue + pointer = model + for m_name in name: + if re.fullmatch(r'[A-Za-z]+_\d+', m_name): + l = re.split(r'_(\d+)', m_name) + else: + l = [m_name] + if l[0] == 'kernel' or l[0] == 'gamma': + pointer = getattr(pointer, 'weight') + elif l[0] == 'output_bias' or l[0] == 'beta': + pointer = getattr(pointer, 'bias') + elif l[0] == 'output_weights': + pointer = getattr(pointer, 'weight') + elif l[0] == 'squad': + pointer = getattr(pointer, 'classifier') + else: + try: + pointer = getattr(pointer, l[0]) + except AttributeError: + logger.info("Skipping {}".format("/".join(name))) + continue + if len(l) >= 2: + num = int(l[1]) + pointer = pointer[num] + if m_name[-11:] == '_embeddings': + pointer = getattr(pointer, 'weight') + elif m_name == 'kernel': + array = np.transpose(array) + try: + assert pointer.shape == array.shape + except AssertionError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info("Initialize PyTorch weight {}".format(name)) + pointer.data = torch.from_numpy(array) + return model + + +#################################################### +# PyTorch Models are constructed by sub-classing +# - torch.nn.Module for the layers and +# - PreTrainedModel for the models 
(it-self a sub-class of torch.nn.Module) +#################################################### + +class T5Layer(nn.Module): + def __init__(self, config): + super(T5Layer, self).__init__() + self.attention = T5Attention(config) + self.intermediate = T5Intermediate(config) + self.output = T5Output(config) + + def forward(self, hidden_states, attention_mask=None, head_mask=None): + attention_outputs = self.attention(hidden_states, attention_mask, head_mask) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + + +class T5PreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = T5Config + pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = "transformer" + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +T5_START_DOCSTRING = r""" The T5 model was proposed in + `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ + by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. + It's an encoder decoder pre-trained transformer. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`: + https://arxiv.org/abs/1910.10683 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +T5_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + T5 is a model with relative position embeddings so you should be able to pad the inputs on + the right or the left. + + Indices can be obtained using :class:`transformers.T5Tokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. 
+ **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. +""" + +@add_start_docstrings("The bare single stack (encoder or decoder) of a T5 Model transformer outputting raw hidden-states" + "without any specific head on top.", + T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +class T5Model(T5PreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased') + model = T5Model.from_pretrained('t5-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config): + super(T5Model, self).__init__(config) + + self.embeddings = T5Embeddings(config) + self.encoder = T5Encoder(config) + self.pooler = T5Pooler(config) + + self.init_weights() + + @property + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, new_embeddings): + self.embeddings.word_embeddings = new_embeddings + + def _prune_heads(self, heads_to_prune): + """ Prunes heads of the model. + heads_to_prune: dict of {layer_num: list of heads to prune in this layer} + See base class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + # We create a 3D attention mask from a 2D tensor mask. 
+ # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + ################################## + # Replace this with your model code + embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) + encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) + sequence_output = encoder_outputs[0] + outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + + return outputs # sequence_output, (hidden_states), (attentions) + + +@add_start_docstrings("""T5 Model with a `language modeling` head on top. """, + T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +class T5WithLMHead(T5PreTrainedModel): + r""" + **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). 
+ **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased') + model = T5ForMaskedLM.from_pretrained('t5-base-uncased') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + def __init__(self, config): + super(T5ForMaskedLM, self).__init__(config) + + self.transformer = T5Model(config) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size) + + self.init_weights() + + def get_output_embeddings(self): + return self.lm_head + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + lm_labels=None): + + outputs = self.transformer(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + lm_logits = self.cls(sequence_output) + + outputs = (lm_logits,) + outputs[2:] # Add hidden states and attention if they are here + if lm_labels is not None: + shift_logits = lm_logits[..., :-1, :].contiguous() + shift_labels = lm_labels[..., 1:].contiguous() + loss_fct = CrossEntropyLoss(ignore_index=-1) + loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1)) + outputs = (loss,) + outputs + + return outputs # (lm_loss), lm_logits, (hidden_states), (attentions) diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py new file mode 100644 index 0000000000..deb453846c --- /dev/null +++ b/transformers/modeling_tf_t5.py @@ -0,0 +1,496 @@ +# coding=utf-8 +# Copyright 2018 T5 Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" TF 2.0 T5 model. 
""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import json +import logging +import math +import os +import sys +from io import open + +import numpy as np +import tensorflow as tf + +from .configuration_t5 import T5Config +from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +#################################################### +# This dict contrains shortcut names and associated url +# for the pretrained weights provided with the models +#################################################### +TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { + 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-tf_model.h5", + 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-tf_model.h5", +} + +#################################################### +# TF 2.0 Models are constructed using Keras imperative API by sub-classing +# - tf.keras.layers.Layer for the layers and +# - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) +#################################################### + +#################################################### +# Here is an example of typical layer in a TF 2.0 model of the library +# The classes are usually identical to the PyTorch ones and prefixed with 'TF'. +# +# Note that class __init__ parameters includes **kwargs (send to 'super'). +# This let us have a control on class scope and variable names: +# More precisely, we set the names of the class attributes (lower level layers) to +# to the equivalent attributes names in the PyTorch model so we can have equivalent +# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other. 
+# +# See the conversion methods in modeling_tf_pytorch_utils.py for more details +#################################################### +class TFT5Layer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFT5Layer, self).__init__(**kwargs) + self.attention = TFT5Attention(config, name='attention') + self.intermediate = TFT5Intermediate(config, name='intermediate') + self.transformer_output = TFT5Output(config, name='output') + + def call(self, inputs, training=False): + hidden_states, attention_mask, head_mask = inputs + + attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) + attention_output = attention_outputs[0] + intermediate_output = self.intermediate(attention_output) + layer_output = self.transformer_output([intermediate_output, attention_output], training=training) + outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + return outputs + + +#################################################### +# The full model without a specific pretrained or finetuning head is +# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer" +#################################################### +class TFT5MainLayer(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFT5MainLayer, self).__init__(**kwargs) + + def _resize_token_embeddings(self, new_num_tokens): + raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models + + def _prune_heads(self, heads_to_prune): + raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models + + def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): + # We allow three types of multi-inputs: + # - traditional keyword arguments in the call method + # - all the arguments provided as a dict in the first positional argument of call + # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call + # The last two options are useful to use the tf.keras fit() method. + + if isinstance(inputs, (tuple, list)): + input_ids = inputs[0] + attention_mask = inputs[1] if len(inputs) > 1 else attention_mask + token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids + position_ids = inputs[3] if len(inputs) > 3 else position_ids + head_mask = inputs[4] if len(inputs) > 4 else head_mask + assert len(inputs) <= 5, "Too many inputs." + elif isinstance(inputs, dict): + input_ids = inputs.get('input_ids') + attention_mask = inputs.get('attention_mask', attention_mask) + token_type_ids = inputs.get('token_type_ids', token_type_ids) + position_ids = inputs.get('position_ids', position_ids) + head_mask = inputs.get('head_mask', head_mask) + assert len(inputs) <= 5, "Too many inputs." + else: + input_ids = inputs + + if attention_mask is None: + attention_mask = tf.fill(tf.shape(input_ids), 1) + if token_type_ids is None: + token_type_ids = tf.fill(tf.shape(input_ids), 0) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. 
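+        # Illustration with hypothetical values: a 2D mask [[1, 1, 0]] of shape [batch=1, seq=3]
+        # becomes shape [1, 1, 1, 3] here; after the (1.0 - mask) * -10000.0 step below, it is
+        # [[[[0., 0., -10000.]]]], i.e. a large negative additive bias on the padded position.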
+ extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + + extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if not head_mask is None: + raise NotImplementedError + else: + head_mask = [None] * self.num_hidden_layers + # head_mask = tf.constant([0] * self.num_hidden_layers) + + ################################## + # Replace this with your model code + embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) + encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) + sequence_output = encoder_outputs[0] + outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + + return outputs # sequence_output, (hidden_states), (attentions) + + +#################################################### +# TFT5PreTrainedModel is a sub-class of tf.keras.Model +# which take care of loading and saving pretrained weights +# and various common utilities. +# Here you just need to specify a few (self-explanatory) +# pointers for your model. +#################################################### +class TFT5PreTrainedModel(TFPreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. + """ + config_class = T5Config + pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "transformer" + + +T5_START_DOCSTRING = r""" The XXX model was proposed in + `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ + by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer + pre-trained using a combination of masked language modeling objective and next sentence prediction + on a large corpus comprising the Toronto Book Corpus and Wikipedia. + + This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and + refer to the TF 2.0 documentation for all matter related to general usage and behavior. + + .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`: + https://arxiv.org/abs/1810.04805 + + .. _`tf.keras.Model`: + https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model + + Note on the model inputs: + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is usefull when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`. 
+ + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument : + + - a single Tensor with input_ids only and nothing else: `model(inputs_ids) + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associaed to the input names given in the docstring: + `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` + + Parameters: + config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the configuration. + Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +XXX_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + Indices can be obtained using :class:`transformers.XxxTokenizer`. + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Segment token indices to indicate first and second portions of the inputs. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1]``. + **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 
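+
+    For illustration, with hypothetical values::
+
+        input_ids = tf.constant([[31, 51, 99], [15, 5, 0]])   # (batch_size, sequence_length) = (2, 3)
+        attention_mask = tf.constant([[1, 1, 1], [1, 1, 0]])  # 0 marks the padded position in the second row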
+""" + +@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.", + XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +class TFXxxModel(TFXxxPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. The Linear + layer weights are trained from the next sentence prediction (classification) + objective during Xxx pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XxxTokenizer, TFXxxModel + + tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') + model = TFXxxModel.from_pretrained('xxx-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXxxModel, self).__init__(config, *inputs, **kwargs) + self.transformer = TFXxxMainLayer(config, name='transformer') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + return outputs + + +@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, + XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +class TFXxxForMaskedLM(TFXxxPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + import tensorflow as tf + from transformers import XxxTokenizer, TFXxxForMaskedLM + + tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') + model = TFXxxForMaskedLM.from_pretrained('xxx-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + prediction_scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs) + + self.transformer = TFXxxMainLayer(config, name='transformer') + self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + + sequence_output = outputs[0] + prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + + outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + + return outputs # prediction_scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of + the pooled output) e.g. for GLUE tasks. """, + XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +class TFXxxForSequenceClassification(TFXxxPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import XxxTokenizer, TFXxxForSequenceClassification + + tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') + model = TFXxxForSequenceClassification.from_pretrained('xxx-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + logits = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXxxMainLayer(config, name='transformer') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='classifier') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) + logits = self.classifier(pooled_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # logits, (hidden_states), (attentions) + + +@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +class TFXxxForTokenClassification(TFXxxPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import XxxTokenizer, TFXxxForTokenClassification + + tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') + model = TFXxxForTokenClassification.from_pretrained('xxx-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + scores = outputs[0] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXxxMainLayer(config, name='transformer') + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='classifier') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) + logits = self.classifier(sequence_output) + + outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here + + return outputs # scores, (hidden_states), (attentions) + + +@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) +class TFXxxForQuestionAnswering(TFXxxPreTrainedModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + import tensorflow as tf + from transformers import XxxTokenizer, TFXxxForQuestionAnswering + + tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') + model = TFXxxForQuestionAnswering.from_pretrained('xxx-base-uncased') + input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 + outputs = model(input_ids) + start_scores, end_scores = outputs[:2] + + """ + def __init__(self, config, *inputs, **kwargs): + super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.transformer = TFXxxMainLayer(config, name='transformer') + self.qa_outputs = tf.keras.layers.Dense(config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name='qa_outputs') + + def call(self, inputs, **kwargs): + outputs = self.transformer(inputs, **kwargs) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + + return outputs # start_logits, end_logits, (hidden_states), (attentions) diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py new file mode 100644 index 0000000000..3f8f4bf556 --- /dev/null +++ b/transformers/tokenization_t5.py @@ -0,0 +1,214 @@ +# coding=utf-8 +# Copyright 2018 T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Tokenization class for model T5.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to file names for serializing Tokenizer instances +#################################################### +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +#################################################### +# Mapping from the keyword arguments names of Tokenizer `__init__` +# to pretrained vocabulary URL for all the model shortcut names. 
+#################################################### +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-vocab.txt", + 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-vocab.txt", + } +} + +#################################################### +# Mapping from model shortcut names to max length of inputs +#################################################### +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 't5-base-uncased': 512, + 't5-large-uncased': 512, +} + +#################################################### +# Mapping from model shortcut names to a dictionary of additional +# keyword arguments for Tokenizer `__init__`. +# To be used for checkpoint specific configurations. +#################################################### +PRETRAINED_INIT_CONFIGURATION = { + 't5-base-uncased': {'do_lower_case': True}, + 't5-large-uncased': {'do_lower_case': True}, +} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip('\n') + vocab[token] = index + return vocab + + +class T5Tokenizer(PreTrainedTokenizer): + r""" + Constructs a T5Tokenizer. + :class:`~transformers.T5Tokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece + + Args: + vocab_file: Path to a one-wordpiece-per-line vocabulary file + do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False + """ + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, do_lower_case=True, + unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", + mask_token="[MASK]", **kwargs): + """Constructs a T5Tokenizer. + + Args: + **vocab_file**: Path to a one-wordpiece-per-line vocabulary file + **do_lower_case**: (`optional`) boolean (default True) + Whether to lower case the input + Only has an effect when do_basic_tokenize=True + """ + super(T5Tokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, + pad_token=pad_token, cls_token=cls_token, + mask_token=mask_token, **kwargs) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + + @property + def vocab_size(self): + return len(self.vocab) + + def _tokenize(self, text): + """ Take as input a string and return a list of strings (tokens) for words/sub-words + """ + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. 
""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """ Converts a sequence of tokens (string) in a single string. """ + out_string = ' '.join(tokens).replace(' ##', '').strip() + return out_string + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A BERT sequence has the following format: + single sequence: [CLS] X [SEP] + pair of sequences: [CLS] A [SEP] B [SEP] + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError("You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model.") + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A BERT sequence pair mask has the following format: + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, vocab_path): + """Save the tokenizer vocabulary to a directory or file.""" + index = 0 + if os.path.isdir(vocab_path): + vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) + else: + vocab_file = vocab_path + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 
+ " Please check that the vocabulary is not corrupted!".format(vocab_file)) + index = token_index + writer.write(token + u'\n') + index += 1 + return (vocab_file,) From 568c0ffb7ef73555567f8bd467cf80c2b1e6ac13 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 5 Nov 2019 16:40:29 +0100 Subject: [PATCH 009/302] adding T5 model --- transformers/modeling_encoder_decoder.py | 4 +- transformers/modeling_t5.py | 471 ++++++++++++++++++++--- 2 files changed, 412 insertions(+), 63 deletions(-) diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index a884abd0a2..713cf5252e 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -217,9 +217,7 @@ class PreTrainedEncoderDecoder(nn.Module): encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) if encoder_hidden_states is None: encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder) - encoder_hidden_states = encoder_outputs[ - 0 - ] # output the last layer hidden state + encoder_hidden_states = encoder_outputs[0] else: encoder_outputs = () diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index fa3c22f24b..d93e96211d 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 T5 Authors and HuggingFace Inc. team. +# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -20,11 +20,14 @@ import json import logging import math import os +import math import sys +import itertools from io import open import torch from torch import nn +import torch.nn.functional as F from torch.nn import CrossEntropyLoss, MSELoss from .modeling_utils import PreTrainedModel, prune_linear_layer @@ -119,31 +122,389 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) #################################################### -class T5Layer(nn.Module): +class T5DenseReluDense(nn.Module): def __init__(self, config): - super(T5Layer, self).__init__() - self.attention = T5Attention(config) - self.intermediate = T5Intermediate(config) - self.output = T5Output(config) + super(T5DenseReluDense, self).__init__() + self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) + self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) + self.dropout = nn.Dropout(config.dropout) - def forward(self, hidden_states, attention_mask=None, head_mask=None): - attention_outputs = self.attention(hidden_states, attention_mask, head_mask) - attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.output(intermediate_output, attention_output) - outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + def forward(self, hidden_states): + h = self.wi(hidden_states) + h = F.relu(h) + h = self.dropout(h) + h = self.wo(h) + return h + + +class T5LayerFF(nn.Module): + def __init__(self, config): + super(T5LayerFF, self).__init__() + self.DenseReluDense = T5DenseReluDense(config) + self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, hidden_states): + norm_x = self.layer_norm(hidden_states) + y = self.DenseReluDense(norm_x) + layer_output = hidden_states + self.dropout(y) + return layer_output + + +class 
T5Attention(nn.Module): + NEW_ID = itertools.count() + + def __init__(self, config): + super(T5Attention, self).__init__() + self.layer_id = next(T5Attention.NEW_ID) + + self.output_attentions = config.output_attentions + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.dim = config.d_model + self.n_heads = config.num_heads + self.dropout = config.dropout_rate + assert self.dim % self.n_heads == 0 + + self.q = nn.Linear(self.dim, self.dim, bias=False) + self.k = nn.Linear(self.dim, self.dim, bias=False) + self.v = nn.Linear(self.dim, self.dim, bias=False) + self.o = nn.Linear(self.dim, self.dim, bias=False) + + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + self.pruned_heads = set() + + def prune_heads(self, heads): + attention_head_size = self.dim // self.n_heads + if len(heads) == 0: + return + mask = torch.ones(self.n_heads, attention_head_size) + heads = set(heads) - self.pruned_heads + for head in heads: + head -= sum(1 if h < head else 0 for h in self.pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index = torch.arange(len(mask))[mask].long() + # Prune linear layers + self.q = prune_linear_layer(self.q, index) + self.k = prune_linear_layer(self.k, index) + self.v = prune_linear_layer(self.v, index) + self.o = prune_linear_layer(self.o, index, dim=1) + # Update hyper params + self.n_heads = self.n_heads - len(heads) + self.dim = attention_head_size * self.n_heads + self.pruned_heads = self.pruned_heads.union(heads) + + @staticmethod + def _relative_position_bucket(relative_position, + bidirectional=True, + num_buckets=32, + max_distance=128): + """ + Adapted from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. + The relative position is defined as memory_position - query_position, i.e. + the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are + invalid. + We use smaller buckets for small absolute relative_position and larger buckets + for larger absolute relative_positions. All relative positions >=max_distance + map to the same bucket. All relative positions <=-max_distance map to the + same bucket. This should allow for more graceful generalization to longer + sequences than the model has been trained on. 
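Concretely, the bucketing can be exercised on its own. The sketch below is not part of the patch; it mirrors the mapping implemented just after this docstring, assuming only torch, and uses the tensor-safe element-wise min/max that a later commit in this series switches to (torch.max(n, torch.zeros_like(n)) instead of torch.max(n, 0)) so it runs as written.

    import math
    import torch

    def relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        # Mirrors T5Attention._relative_position_bucket for a standalone sanity check.
        ret = 0
        n = -relative_position
        if bidirectional:
            num_buckets //= 2
            ret += (n < 0).to(torch.long) * num_buckets        # separate buckets when the key is after the query
            n = torch.abs(n)
        else:
            n = torch.max(n, torch.zeros_like(n))
        max_exact = num_buckets // 2
        is_small = n < max_exact                               # small offsets each get their own bucket
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact)
            / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
        ret += torch.where(is_small, n, val_if_large)          # large offsets share log-spaced buckets
        return ret

    qlen = klen = 8
    relative_position = torch.arange(klen)[None, :] - torch.arange(qlen)[:, None]  # memory - query
    print(relative_position_bucket(relative_position))         # an (8, 8) matrix of bucket ids in [0, 32)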
+ Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + Returns: + a Tensor with the same shape as relative_position, containing int32 + values in the range [0, num_buckets) + """ + ret = 0 + n = -relative_position + if bidirectional: + num_buckets //= 2 + ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets + n = torch.abs(n) + else: + n = torch.max(n, 0) + # now n is in the range [0, inf) + + # half of the buckets are for exact increments in positions + max_exact = num_buckets // 2 + is_small = (n < max_exact) + + # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance + val_if_large = max_exact + ( + torch.log(n.float() / max_exact) + / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long) + val_if_large = torch.min(val_if_large, num_buckets - 1) + + ret += torch.where(is_small, n, val_if_large) + return ret + + def compute_bias(self, qlen, klen): + """ Compute binned relative position bias """ + context_position = torch.arange(qlen, dtype=torch.long)[:, None] + memory_position = torch.arange(klen, dtype=torch.long)[None, :] + relative_position = memory_position - context_position # shape (qlen, klen) + rp_bucket = self._relative_position_bucket(relative_position, + bidirectional=not self.is_decoder, + num_buckets=self.relative_attention_num_buckets) + values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) + values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) + return values + + def forward(self, input, mask, kv=None, position_bias=None, cache=None, head_mask=None): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). 
+ """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = input.size() + if kv is None: + klen = qlen if cache is None else cache['slen'] + qlen + else: + klen = kv.size(1) + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + n_heads = self.n_heads + dim_per_head = self.dim // n_heads + mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen) + + def shape(x): + """ projection """ + return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + + def unshape(x): + """ compute context """ + return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + + q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) + if kv is None: + k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = torch.cat([k_, k], dim=2) # (bs, n_heads, klen, dim_per_head) + v = torch.cat([v_, v], dim=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + cache[self.layer_id] = (k, v) + + # q = q / math.sqrt(dim_per_head) # No scaling in T5 + scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + + if position_bias is None: + position_bias = self.compute_bias(qlen, klen) + scores += position_bias + + mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + + weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) + weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = torch.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + + context = self.o(context) + + outputs = (context,) + if self.output_attentions: + outputs = outputs + (weights,) return outputs +class T5LayerSelfAttention(nn.Module): + def __init__(self, config): + super(T5LayerSelfAttention, self).__init__() + self.SelfAttention = T5Attention(config) + self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout) -class T5PreTrainedModel(PreTrainedModel): + def forward(self, hidden_states, attention_mask=None, head_mask=None): + norm_x = self.layer_norm(hidden_states) + attention_output = self.SelfAttention(norm_x, + attention_mask=attention_mask, + head_mask=head_mask) + y = attention_output[0] + layer_output = hidden_states + self.dropout(y) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5LayerCrossAttention(nn.Module): + def __init__(self, config): + super(T5LayerCrossAttention, self).__init__() + self.EncDecAttention = T5Attention(config) + self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, hidden_states, kv, attention_mask=None, head_mask=None): + norm_x = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention(norm_x, + kv=kv, + attention_mask=attention_mask, + head_mask=head_mask) + y = 
attention_output[0] + layer_output = hidden_states + self.dropout(y) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class T5Block(nn.Module): + def __init__(self, config): + super(T5Block, self).__init__() + self.is_decoder = config.is_decoder + self.layer_000 = T5LayerSelfAttention(config) + if self.is_decoder: + self.layer_001 = T5LayerCrossAttention(config) + self.layer_002 = T5LayerFF(config) + else: + self.layer_001 = T5LayerFF(config) + + def forward(self, hidden_states, attention_mask=None, + encoder_hidden_states=None, encoder_attention_mask=None, head_mask=None): + self_attention_outputs = self.layer_000(hidden_states, + attention_mask=attention_mask, + head_mask=head_mask) + hidden_states = self_attention_outputs[0] + outputs = self_attention_outputs[1:] + + if self.is_decoder: + cross_attention_outputs = self.layer_001(hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + head_mask=head_mask) + hidden_states = cross_attention_outputs[0] + outputs = cross_attention_outputs[1:] + outputs + hidden_states = self.layer_002(hidden_states) + else: + hidden_states = self.layer_001(hidden_states) + + outputs = (hidden_states,) + outputs # add attentions if we output them + return outputs + + +class T5Stack(nn.Module): + def __init__(self, config): + super(T5Stack, self).__init__() + self.blocks = nn.ModuleList([T5Block(config) for _ in range(config.num_layers)]) + self.final_layer_norm = nn.LayerNorm(config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout) + + def forward(self, + hidden_states, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None): + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if attention_mask.dim() == 2: + if self.config.is_decoder: + batch_size, seq_length = input_ids.size() + seq_ids = torch.arange(seq_length, device=input_ids.device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. 
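A minimal illustration of that additive conversion, with toy shapes and only torch assumed (the variable names below are illustrative, not taken from the model): a causal mask built as above is combined with a 0/1 padding mask and turned into the 0.0 / -10000.0 tensor that gets added to the raw scores.

    import torch

    batch_size, seq_length = 1, 5
    attention_mask = torch.tensor([[1., 1., 1., 1., 0.]])      # last position is padding

    # Causal part: position i may only attend to positions <= i (same construction as above).
    seq_ids = torch.arange(seq_length)
    causal_mask = (seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]).float()

    # Combine with the padding mask and convert to the additive form used on the scores.
    extended = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]   # (batch, 1, seq, seq)
    extended = (1.0 - extended) * -10000.0                                     # keep -> 0.0, mask -> -10000.0

    scores = torch.zeros(batch_size, 1, seq_length, seq_length)                # stand-in for raw q.k scores
    weights = torch.softmax(scores + extended, dim=-1)
    print(weights[0, 0])   # each row spreads weight only over positions that are neither future nor padded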
+ extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + else: + head_mask = [None] * self.config.num_hidden_layers + + all_hidden_states = () + all_attentions = () + position_bias = None + for i, layer_module in enumerate(self.layer): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_outputs = layer_module(hidden_states, + attention_mask=extended_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + head_mask=head_mask[i]) + hidden_states = layer_outputs[0] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.final_layer_norm(hidden_states) + layer_output = self.dropout(hidden_states) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) + + +class T5PreTrainedModel(PreTrainedEncoderDecoder): """ An abstract class to handle weights initialization and a simple interface for dowloading and loading pretrained models. 
""" config_class = T5Config pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP load_tf_weights = load_tf_weights_in_t5 - base_model_prefix = "transformer" def _init_weights(self, module): """ Initialize the weights """ @@ -238,19 +599,23 @@ class T5Model(T5PreTrainedModel): """ def __init__(self, config): super(T5Model, self).__init__(config) + self.shared = nn.Embeddings(config.vocab_size, config.d_model) - self.embeddings = T5Embeddings(config) - self.encoder = T5Encoder(config) - self.pooler = T5Pooler(config) + encoder_config = copy.deepcopy(config) + self.encoder = T5Stack(encoder_config) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + self.decoder = T5Stack(decoder_config) self.init_weights() @property def get_input_embeddings(self): - return self.embeddings.word_embeddings + return self.shared def set_input_embeddings(self, new_embeddings): - self.embeddings.word_embeddings = new_embeddings + self.shared = new_embeddings def _prune_heads(self, heads_to_prune): """ Prunes heads of the model. @@ -260,50 +625,36 @@ class T5Model(T5PreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None): - if attention_mask is None: - attention_mask = torch.ones_like(input_ids) - if token_type_ids is None: - token_type_ids = torch.zeros_like(input_ids) + def forward(self, encoder_input_ids, decoder_input_ids, **kwargs): + # keyword arguments come in 3 flavors: encoder-specific (prefixed by + # `encoder_`), decoder-specific (prefixed by `decoder_`) and those + # that apply to the model as whole. + # We let the specific kwargs override the common ones in case of conflict. + kwargs_common = dict((k, v) for k, v in kwargs.items() + if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_decoder = kwargs_common.copy() + kwargs_encoder = kwargs_common.copy() + kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) - - # Since attention_mask is 1.0 for positions we want to attend and 0.0 for - # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. - # Since we are adding it to the raw scores before the softmax, this is - # effectively the same as removing these entirely. 
- extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - - # Prepare head mask if needed - # 1.0 in head_mask indicate we keep the head - # attention_probs has shape bsz x n_heads x N x N - # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] - # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] - if head_mask is not None: - if head_mask.dim() == 1: - head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) - head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) - elif head_mask.dim() == 2: - head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer - head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility + # Encode if needed (training, first prediction pass) + encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) + if encoder_hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) + encoder_hidden_states = encoder_outputs[0] else: - head_mask = [None] * self.config.num_hidden_layers + encoder_outputs = () - ################################## - # Replace this with your model code - embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask) - sequence_output = encoder_outputs[0] - outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + # Decode + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states + kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) + decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) - return outputs # sequence_output, (hidden_states), (attentions) + return decoder_outputs + encoder_outputs @add_start_docstrings("""T5 Model with a `language modeling` head on top. 
""", @@ -342,7 +693,7 @@ class T5WithLMHead(T5PreTrainedModel): super(T5ForMaskedLM, self).__init__(config) self.transformer = T5Model(config) - self.lm_head = nn.Linear(config.n_embd, config.vocab_size) + self.lm_head = nn.Linear(config.d_model, config.vocab_size) self.init_weights() From 88e5bef58f34dca87f28ab489fdecbeaaef8b316 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 5 Nov 2019 17:02:52 +0100 Subject: [PATCH 010/302] share position biases --- transformers/modeling_t5.py | 65 +++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 25 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index d93e96211d..e1a1d019ff 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -154,9 +154,10 @@ class T5LayerFF(nn.Module): class T5Attention(nn.Module): NEW_ID = itertools.count() - def __init__(self, config): + def __init__(self, config, has_relative_attention_bias=False): super(T5Attention, self).__init__() self.layer_id = next(T5Attention.NEW_ID) + self.has_relative_attention_bias = has_relative_attention_bias self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets @@ -170,7 +171,8 @@ class T5Attention(nn.Module): self.v = nn.Linear(self.dim, self.dim, bias=False) self.o = nn.Linear(self.dim, self.dim, bias=False) - self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) self.pruned_heads = set() def prune_heads(self, heads): @@ -304,6 +306,8 @@ class T5Attention(nn.Module): scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) if position_bias is None: + if not self.has_relative_attention_bias: + raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(qlen, klen) scores += position_bias @@ -325,20 +329,23 @@ class T5Attention(nn.Module): outputs = (context,) if self.output_attentions: outputs = outputs + (weights,) + if self.has_relative_attention_bias: + outputs = outputs + (position_bias,) return outputs class T5LayerSelfAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, has_relative_attention_bias=False): super(T5LayerSelfAttention, self).__init__() - self.SelfAttention = T5Attention(config) + self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout) - def forward(self, hidden_states, attention_mask=None, head_mask=None): + def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) attention_output = self.SelfAttention(norm_x, attention_mask=attention_mask, + position_bias=position_bias, head_mask=head_mask) y = attention_output[0] layer_output = hidden_states + self.dropout(y) @@ -347,17 +354,18 @@ class T5LayerSelfAttention(nn.Module): class T5LayerCrossAttention(nn.Module): - def __init__(self, config): + def __init__(self, config, has_relative_attention_bias=False): super(T5LayerCrossAttention, self).__init__() - self.EncDecAttention = T5Attention(config) + self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) self.dropout = 
nn.Dropout(config.dropout) - def forward(self, hidden_states, kv, attention_mask=None, head_mask=None): + def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) attention_output = self.EncDecAttention(norm_x, kv=kv, attention_mask=attention_mask, + position_bias=position_bias, head_mask=head_mask) y = attention_output[0] layer_output = hidden_states + self.dropout(y) @@ -366,20 +374,22 @@ class T5LayerCrossAttention(nn.Module): class T5Block(nn.Module): - def __init__(self, config): + def __init__(self, config, has_relative_attention_bias=False): super(T5Block, self).__init__() self.is_decoder = config.is_decoder - self.layer_000 = T5LayerSelfAttention(config) + self.layer_000 = T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) if self.is_decoder: - self.layer_001 = T5LayerCrossAttention(config) + self.layer_001 = T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias) self.layer_002 = T5LayerFF(config) else: self.layer_001 = T5LayerFF(config) - def forward(self, hidden_states, attention_mask=None, - encoder_hidden_states=None, encoder_attention_mask=None, head_mask=None): + def forward(self, hidden_states, attention_mask=None, position_bias=None, + encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, + head_mask=None): self_attention_outputs = self.layer_000(hidden_states, attention_mask=attention_mask, + position_bias=position_bias, head_mask=head_mask) hidden_states = self_attention_outputs[0] outputs = self_attention_outputs[1:] @@ -388,6 +398,7 @@ class T5Block(nn.Module): cross_attention_outputs = self.layer_001(hidden_states, kv=encoder_hidden_states, attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, head_mask=head_mask) hidden_states = cross_attention_outputs[0] outputs = cross_attention_outputs[1:] + outputs @@ -402,7 +413,8 @@ class T5Block(nn.Module): class T5Stack(nn.Module): def __init__(self, config): super(T5Stack, self).__init__() - self.blocks = nn.ModuleList([T5Block(config) for _ in range(config.num_layers)]) + self.blocks = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) + for i in range(config.num_layers)]) self.final_layer_norm = nn.LayerNorm(config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout) @@ -413,8 +425,12 @@ class T5Stack(nn.Module): encoder_attention_mask=None, head_mask=None): + batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1] + encoder_seq_length = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 0 if attention_mask is None: - attention_mask = torch.ones_like(input_ids) + attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] # ourselves in which case we just need to make it broadcastable to all heads. 
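The relative-position bias depends only on the query/key offsets, not on the layer, so only the first block owns a relative_attention_bias embedding and every later block reuses the tensor it produced. A toy sketch of that sharing pattern follows; ToyAttention is illustrative only, not the real T5Attention, and the bucket ids are stand-ins.

    import torch
    from torch import nn

    class ToyAttention(nn.Module):
        """Stripped-down stand-in: only the first layer owns the bias embedding."""
        def __init__(self, n_heads, num_buckets, has_relative_attention_bias=False):
            super(ToyAttention, self).__init__()
            self.has_relative_attention_bias = has_relative_attention_bias
            if has_relative_attention_bias:
                self.relative_attention_bias = nn.Embedding(num_buckets, n_heads)

        def forward(self, scores, rp_bucket, position_bias=None):
            if position_bias is None:
                # only the owning layer can compute the bias; every later layer reuses it
                position_bias = self.relative_attention_bias(rp_bucket).permute(2, 0, 1).unsqueeze(0)
            return scores + position_bias, position_bias

    n_heads, qlen = 4, 6
    blocks = [ToyAttention(n_heads, 32, has_relative_attention_bias=(i == 0)) for i in range(3)]
    rp_bucket = torch.zeros(qlen, qlen, dtype=torch.long)       # stand-in for the real bucket ids
    scores = torch.zeros(1, n_heads, qlen, qlen)
    position_bias = None
    for block in blocks:
        scores, position_bias = block(scores, rp_bucket, position_bias=position_bias)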
@@ -426,8 +442,7 @@ class T5Stack(nn.Module): # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] if attention_mask.dim() == 2: if self.config.is_decoder: - batch_size, seq_length = input_ids.size() - seq_ids = torch.arange(seq_length, device=input_ids.device) + seq_ids = torch.arange(seq_length, device=hidden_states.device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: @@ -469,16 +484,22 @@ class T5Stack(nn.Module): all_hidden_states = () all_attentions = () position_bias = None + encoder_decoder_position_bias = None for i, layer_module in enumerate(self.layer): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) layer_outputs = layer_module(hidden_states, attention_mask=extended_attention_mask, + position_bias=position_bias, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i]) hidden_states = layer_outputs[0] + if i == 0: + position_bias = layer_outputs[2] if len(layer_outputs) > 3 else None + encoder_decoder_position_bias = layer_outputs[4] if len(layer_outputs) > 5 else None if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) @@ -700,14 +721,8 @@ class T5WithLMHead(T5PreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, - lm_labels=None): - - outputs = self.transformer(input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - position_ids=position_ids, - head_mask=head_mask) + def forward(self, encoder_input_ids, decoder_input_ids, **kwargs): + outputs = self.transformer(encoder_input_ids, decoder_input_ids, **kwargs) sequence_output = outputs[0] lm_logits = self.cls(sequence_output) From 3835e1e651ebeeddaa8dd8cb5f4d30912ec5ec6d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 6 Nov 2019 11:52:29 +0100 Subject: [PATCH 011/302] adding tokenizer --- transformers/tokenization_t5.py | 188 +++++++++----------------------- 1 file changed, 51 insertions(+), 137 deletions(-) diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 3f8f4bf556..cff6a41baf 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -16,16 +16,15 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import collections import logging import os -import unicodedata -from io import open from .tokenization_utils import PreTrainedTokenizer logger = logging.getLogger(__name__) +SPIECE_UNDERLINE = u'▁' + #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances @@ -39,8 +38,7 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} PRETRAINED_VOCAB_FILES_MAP = { 'vocab_file': { - 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-vocab.txt", - 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-vocab.txt", + 't5': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", } } @@ -48,167 +46,83 @@ PRETRAINED_VOCAB_FILES_MAP = { # Mapping from model shortcut names to max length of inputs 
#################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 't5-base-uncased': 512, - 't5-large-uncased': 512, + 't5': 512, } -#################################################### -# Mapping from model shortcut names to a dictionary of additional -# keyword arguments for Tokenizer `__init__`. -# To be used for checkpoint specific configurations. -#################################################### -PRETRAINED_INIT_CONFIGURATION = { - 't5-base-uncased': {'do_lower_case': True}, - 't5-large-uncased': {'do_lower_case': True}, -} - - -def load_vocab(vocab_file): - """Loads a vocabulary file into a dictionary.""" - vocab = collections.OrderedDict() - with open(vocab_file, "r", encoding="utf-8") as reader: - tokens = reader.readlines() - for index, token in enumerate(tokens): - token = token.rstrip('\n') - vocab[token] = index - return vocab - - class T5Tokenizer(PreTrainedTokenizer): - r""" - Constructs a T5Tokenizer. - :class:`~transformers.T5Tokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece - - Args: - vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False """ + SentencePiece based tokenizer. Peculiarities: + - requires `SentencePiece `_ + """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP - pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - def __init__(self, vocab_file, do_lower_case=True, - unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]", - mask_token="[MASK]", **kwargs): - """Constructs a T5Tokenizer. + def __init__(self, vocab_file, eos_token="", unk_token="", + pad_token="", **kwargs): + super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token, + pad_token=pad_token, **kwargs) - Args: - **vocab_file**: Path to a one-wordpiece-per-line vocabulary file - **do_lower_case**: (`optional`) boolean (default True) - Whether to lower case the input - Only has an effect when do_basic_tokenize=True - """ - super(T5Tokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, - pad_token=pad_token, cls_token=cls_token, - mask_token=mask_token, **kwargs) - self.max_len_single_sentence = self.max_len - 2 # take into account special tokens - self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + try: + import sentencepiece as spm + except ImportError: + logger.warning("You need to install SentencePiece to use T5Tokenizer:" + "https://github.com/google/sentencepiece" + "pip install sentencepiece") - if not os.path.isfile(vocab_file): - raise ValueError( - "Can't find a vocabulary file at path '{}'. 
To load the vocabulary from a Google pretrained " - "model use `tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) - self.vocab = load_vocab(vocab_file) + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(vocab_file) @property def vocab_size(self): - return len(self.vocab) + return self.sp_model.get_piece_size() + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + try: + import sentencepiece as spm + except ImportError: + logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece" + "pip install sentencepiece") + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(self.vocab_file) def _tokenize(self, text): """ Take as input a string and return a list of strings (tokens) for words/sub-words """ - split_tokens = [] - if self.do_basic_tokenize: - for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens): - for sub_token in self.wordpiece_tokenizer.tokenize(token): - split_tokens.append(sub_token) - else: - split_tokens = self.wordpiece_tokenizer.tokenize(text) - return split_tokens + return self.sp_model.EncodeAsPieces(text) def _convert_token_to_id(self, token): """ Converts a token (str/unicode) in an id using the vocab. """ - return self.vocab.get(token, self.vocab.get(self.unk_token)) + return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index): """Converts an index (integer) in a token (string/unicode) using the vocab.""" - return self.ids_to_tokens.get(index, self.unk_token) + return self.sp_model.id_to_piece(index) def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ - out_string = ' '.join(tokens).replace(' ##', '').strip() + out_string = self.sp_model.decode_pieces(tokens) return out_string - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + def save_vocabulary(self, save_directory): + """ Save the sentencepiece vocabulary (copy original file) and special tokens file + to a directory. """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks - by concatenating and adding special tokens. - A BERT sequence has the following format: - single sequence: [CLS] X [SEP] - pair of sequences: [CLS] A [SEP] B [SEP] - """ - if token_ids_1 is None: - return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] - cls = [self.cls_token_id] - sep = [self.sep_token_id] - return cls + token_ids_0 + sep + token_ids_1 + sep + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): - """ - Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding - special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. 
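The tokenizer methods above are thin wrappers around SentencePiece, so the round trip can be checked directly against the underlying processor. A sketch, assuming the sentencepiece package is installed and a trained model file is at hand (the spiece.model path below is illustrative):

    import sentencepiece as spm

    sp = spm.SentencePieceProcessor()
    sp.Load("spiece.model")                                  # illustrative path to a trained model

    pieces = sp.EncodeAsPieces("Hello, my dog is cute")      # what T5Tokenizer._tokenize returns
    ids = [sp.piece_to_id(piece) for piece in pieces]        # per-token _convert_token_to_id
    back = [sp.id_to_piece(i) for i in ids]                  # per-id _convert_id_to_token
    text = sp.decode_pieces(pieces)                          # convert_tokens_to_string reassembles the text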
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) - Args: - token_ids_0: list of ids (must not contain special tokens) - token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids - for sequence pairs - already_has_special_tokens: (default False) Set to True if the token list is already formated with - special tokens for the model - - Returns: - A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token. - """ - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError("You should not supply a second sequence if the provided sequence of " - "ids is already formated with special tokens for the model.") - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): - """ - Creates a mask from the two sequences passed to be used in a sequence-pair classification task. - A BERT sequence pair mask has the following format: - 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 - | first sequence | second sequence - - if token_ids_1 is None, only returns the first portion of the mask (0's). - """ - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_vocabulary(self, vocab_path): - """Save the tokenizer vocabulary to a directory or file.""" - index = 0 - if os.path.isdir(vocab_path): - vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file']) - else: - vocab_file = vocab_path - with open(vocab_file, "w", encoding="utf-8") as writer: - for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): - if index != token_index: - logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive." 
- " Please check that the vocabulary is not corrupted!".format(vocab_file)) - index = token_index - writer.write(token + u'\n') - index += 1 - return (vocab_file,) + return (out_vocab_file,) From 73f2c342f53f2ff02124da23ba029d80c386e7ce Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 6 Nov 2019 11:52:39 +0100 Subject: [PATCH 012/302] fixing template --- templates/adding_a_new_model/configuration_xxx.py | 2 +- templates/adding_a_new_model/modeling_xxx.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index b1614e71af..14c1c2c79e 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -84,7 +84,7 @@ class XxxConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs): super(XxxConfig, self).__init__(**kwargs) - self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1 + self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index ff64f13f40..ee705e753c 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -280,7 +280,6 @@ class XxxModel(XxxPreTrainedModel): self.init_weights() - @property def get_input_embeddings(self): return self.embeddings.word_embeddings From 076a207935bfcc38416cd0baa887d3e025ebef28 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 6 Nov 2019 11:52:50 +0100 Subject: [PATCH 013/302] adding tests and updating model --- transformers/__init__.py | 11 +- transformers/configuration_t5.py | 53 +++--- transformers/modeling_t5.py | 151 ++++++++-------- transformers/tests/modeling_common_test.py | 32 ++-- transformers/tests/modeling_t5_test.py | 176 +++++++++++++++++++ transformers/tests/modeling_tf_t5_test.py | 190 +++++++++++++++++++++ transformers/tests/tokenization_t5_test.py | 77 +++++++++ 7 files changed, 571 insertions(+), 119 deletions(-) create mode 100644 transformers/tests/modeling_t5_test.py create mode 100644 transformers/tests/modeling_tf_t5_test.py create mode 100644 transformers/tests/tokenization_t5_test.py diff --git a/transformers/__init__.py b/transformers/__init__.py index 53f3c39dc7..bf896276d6 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -42,6 +42,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE from .tokenization_xlm import XLMTokenizer from .tokenization_roberta import RobertaTokenizer from .tokenization_distilbert import DistilBertTokenizer +from .tokenization_t5 import T5Tokenizer # Configurations from .configuration_utils import PretrainedConfig @@ -52,10 +53,10 @@ from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CON from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_distilbert import 
DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP # Modeling if is_torch_available(): @@ -69,10 +70,10 @@ if is_torch_available(): BertForTokenClassification, BertForQuestionAnswering, load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel, - OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, - load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) + OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, + load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel, - load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) + load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel, load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP) @@ -95,6 +96,8 @@ if is_torch_available(): DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model + from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel, + T5_PRETRAINED_MODEL_ARCHIVE_MAP) # Optimization from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule, diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index a37a5b2157..9db918e59f 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -64,44 +64,29 @@ class T5Config(PretrainedConfig): pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=50257, - n_positions=1024, - n_ctx=1024, - n_embd=768, - n_layer=12, - n_head=12, - resid_pdrop=0.1, - embd_pdrop=0.1, - attn_pdrop=0.1, - layer_norm_epsilon=1e-5, + vocab_size_or_config_json_file=32128, + n_positions=512, + d_model=512, + d_ff=2048, + num_layers=12, + num_heads=12, + relative_attention_num_buckets=32, + dropout_rate=0.1, + layer_norm_epsilon=1e-6, initializer_range=0.02, - - num_labels=1, - summary_type='cls_index', - summary_use_proj=True, - summary_activation=None, - summary_proj_to_labels=True, - summary_first_dropout=0.1, **kwargs): super(T5Config, self).__init__(**kwargs) - self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1 - self.n_ctx = n_ctx + self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop + self.d_model = d_model + self.d_ff = d_ff + self.num_layers = num_layers + self.num_heads = num_heads + self.relative_attention_num_buckets = relative_attention_num_buckets + self.dropout_rate = dropout_rate self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels if isinstance(vocab_size_or_config_json_file, six.string_types): 
with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: json_config = json.loads(reader.read()) @@ -119,12 +104,12 @@ class T5Config(PretrainedConfig): @property def hidden_size(self): - return self.n_embd + return self.d_model @property def num_attention_heads(self): - return self.n_head + return self.num_heads @property def num_hidden_layers(self): - return self.n_layer + return self.num_layers diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index e1a1d019ff..ce443cf882 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -20,8 +20,8 @@ import json import logging import math import os -import math import sys +import copy import itertools from io import open @@ -30,7 +30,7 @@ from torch import nn import torch.nn.functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import PreTrainedModel, prune_linear_layer +from .modeling_utils import PreTrainedModel from .configuration_t5 import T5Config from .file_utils import add_start_docstrings @@ -127,7 +127,7 @@ class T5DenseReluDense(nn.Module): super(T5DenseReluDense, self).__init__() self.wi = nn.Linear(config.d_model, config.d_ff, bias=False) self.wo = nn.Linear(config.d_ff, config.d_model, bias=False) - self.dropout = nn.Dropout(config.dropout) + self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states): h = self.wi(hidden_states) @@ -141,8 +141,8 @@ class T5LayerFF(nn.Module): def __init__(self, config): super(T5LayerFF, self).__init__() self.DenseReluDense = T5DenseReluDense(config) - self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout) + self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states): norm_x = self.layer_norm(hidden_states) @@ -157,6 +157,7 @@ class T5Attention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super(T5Attention, self).__init__() self.layer_id = next(T5Attention.NEW_ID) + self.is_decoder = config.is_decoder self.has_relative_attention_bias = has_relative_attention_bias self.output_attentions = config.output_attentions @@ -231,7 +232,7 @@ class T5Attention(nn.Module): ret += (n < 0).to(torch.long) * num_buckets # mtf.to_int32(mtf.less(n, 0)) * num_buckets n = torch.abs(n) else: - n = torch.max(n, 0) + n = torch.max(n, torch.zeros_like(n)) # now n is in the range [0, inf) # half of the buckets are for exact increments in positions @@ -242,7 +243,7 @@ class T5Attention(nn.Module): val_if_large = max_exact + ( torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long) - val_if_large = torch.min(val_if_large, num_buckets - 1) + val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1)) ret += torch.where(is_small, n, val_if_large) return ret @@ -259,7 +260,7 @@ class T5Attention(nn.Module): values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen) return values - def forward(self, input, mask, kv=None, position_bias=None, cache=None, head_mask=None): + def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None): """ Self-attention (if kv is None) or attention over source sentence (provided by kv). 
""" @@ -273,7 +274,6 @@ class T5Attention(nn.Module): # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) n_heads = self.n_heads dim_per_head = self.dim // n_heads - mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen) def shape(x): """ projection """ @@ -311,8 +311,9 @@ class T5Attention(nn.Module): position_bias = self.compute_bias(qlen, klen) scores += position_bias - mask = (mask == 0).view(mask_reshape).expand_as(scores) # (bs, n_heads, qlen, klen) - scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + if mask is not None: + mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) + scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) @@ -338,13 +339,13 @@ class T5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super(T5LayerSelfAttention, self).__init__() self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout) + self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) attention_output = self.SelfAttention(norm_x, - attention_mask=attention_mask, + mask=attention_mask, position_bias=position_bias, head_mask=head_mask) y = attention_output[0] @@ -357,14 +358,14 @@ class T5LayerCrossAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super(T5LayerCrossAttention, self).__init__() self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout) + self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None): norm_x = self.layer_norm(hidden_states) attention_output = self.EncDecAttention(norm_x, + mask=attention_mask, kv=kv, - attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask) y = attention_output[0] @@ -410,13 +411,41 @@ class T5Block(nn.Module): return outputs -class T5Stack(nn.Module): +class T5PreTrainedModel(PreTrainedModel): + """ An abstract class to handle weights initialization and + a simple interface for dowloading and loading pretrained models. 
+ """ + config_class = T5Config + pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP + load_tf_weights = load_tf_weights_in_t5 + base_model_prefix = "transformer" + + def _init_weights(self, module): + """ Initialize the weights """ + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class T5Stack(T5PreTrainedModel): def __init__(self, config): - super(T5Stack, self).__init__() + super(T5Stack, self).__init__(config) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.is_decoder = config.is_decoder + self.blocks = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]) - self.final_layer_norm = nn.LayerNorm(config.layer_norm_epsilon) - self.dropout = nn.Dropout(config.dropout) + self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.dropout = nn.Dropout(config.dropout_rate) + + self.init_weights() def forward(self, hidden_states, @@ -426,10 +455,10 @@ class T5Stack(nn.Module): head_mask=None): batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1] - encoder_seq_length = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 0 if attention_mask is None: attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device) - if encoder_attention_mask is None: + if self.is_decoder and encoder_attention_mask is None: + encoder_seq_length = encoder_hidden_states.shape[1] encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device) # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] @@ -444,6 +473,7 @@ class T5Stack(nn.Module): if self.config.is_decoder: seq_ids = torch.arange(seq_length, device=hidden_states.device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + causal_mask = causal_mask.to(attention_mask) extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] else: extended_attention_mask = attention_mask[:, None, None, :] @@ -456,15 +486,18 @@ class T5Stack(nn.Module): extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 - # If a 2D ou 3D attention mask is provided for the cross-attention - # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] - if encoder_attention_mask.dim() == 3: - encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] - if encoder_attention_mask.dim() == 2: - encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + if self.is_decoder: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - 
encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head @@ -474,18 +507,18 @@ class T5Stack(nn.Module): if head_mask is not None: if head_mask.dim() == 1: head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) - head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1) + head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1) elif head_mask.dim() == 2: head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility else: - head_mask = [None] * self.config.num_hidden_layers + head_mask = [None] * self.config.num_layers all_hidden_states = () all_attentions = () position_bias = None encoder_decoder_position_bias = None - for i, layer_module in enumerate(self.layer): + for i, layer_module in enumerate(self.blocks): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -498,8 +531,9 @@ class T5Stack(nn.Module): head_mask=head_mask[i]) hidden_states = layer_outputs[0] if i == 0: - position_bias = layer_outputs[2] if len(layer_outputs) > 3 else None - encoder_decoder_position_bias = layer_outputs[4] if len(layer_outputs) > 5 else None + position_bias = layer_outputs[2 if self.output_attentions else 1] + if self.is_decoder: + encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2] if self.output_attentions: all_attentions = all_attentions + (layer_outputs[1],) @@ -519,27 +553,6 @@ class T5Stack(nn.Module): return outputs # last-layer hidden state, (all hidden states), (all attentions) -class T5PreTrainedModel(PreTrainedEncoderDecoder): - """ An abstract class to handle weights initialization and - a simple interface for dowloading and loading pretrained models. - """ - config_class = T5Config - pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP - load_tf_weights = load_tf_weights_in_t5 - - def _init_weights(self, module): - """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: - module.bias.data.zero_() - - T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. 
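T5Model routes keyword arguments to the two stacks by prefix: un-prefixed arguments apply to both encoder and decoder, while encoder_* / decoder_* arguments are stripped of their prefix and override them. The routing itself can be sketched standalone in pure Python (no model involved; the example arguments are illustrative):

    def split_model_kwargs(**kwargs):
        # Same prefix convention as T5Model.forward: common kwargs first, prefixed kwargs override.
        common = {k: v for k, v in kwargs.items()
                  if not k.startswith("encoder_") and not k.startswith("decoder_")}
        kwargs_encoder = dict(common)
        kwargs_decoder = dict(common)
        kwargs_encoder.update({k[len("encoder_"):]: v for k, v in kwargs.items() if k.startswith("encoder_")})
        kwargs_decoder.update({k[len("decoder_"):]: v for k, v in kwargs.items() if k.startswith("decoder_")})
        return kwargs_encoder, kwargs_decoder

    enc, dec = split_model_kwargs(head_mask=None,
                                  encoder_input_ids=[5, 6, 7],
                                  decoder_input_ids=[5, 6],
                                  decoder_attention_mask=[1, 1])
    # enc -> {'head_mask': None, 'input_ids': [5, 6, 7]}
    # dec -> {'head_mask': None, 'input_ids': [5, 6], 'attention_mask': [1, 1]}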
@@ -620,7 +633,7 @@ class T5Model(T5PreTrainedModel): """ def __init__(self, config): super(T5Model, self).__init__(config) - self.shared = nn.Embeddings(config.vocab_size, config.d_model) + self.shared = nn.Embedding(config.vocab_size, config.d_model) encoder_config = copy.deepcopy(config) self.encoder = T5Stack(encoder_config) @@ -631,7 +644,6 @@ class T5Model(T5PreTrainedModel): self.init_weights() - @property def get_input_embeddings(self): return self.shared @@ -646,17 +658,17 @@ class T5Model(T5PreTrainedModel): for layer, heads in heads_to_prune.items(): self.encoder.layer[layer].attention.prune_heads(heads) - def forward(self, encoder_input_ids, decoder_input_ids, **kwargs): + def forward(self, **kwargs): # keyword arguments come in 3 flavors: encoder-specific (prefixed by # `encoder_`), decoder-specific (prefixed by `decoder_`) and those # that apply to the model as whole. # We let the specific kwargs override the common ones in case of conflict. kwargs_common = dict((k, v) for k, v in kwargs.items() if not k.startswith("encoder_") and not k.startswith("decoder_")) - kwargs_decoder = kwargs_common.copy() kwargs_encoder = kwargs_common.copy() - kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_"))) - kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + kwargs_decoder = kwargs_common.copy() + kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) @@ -680,7 +692,7 @@ class T5Model(T5PreTrainedModel): @add_start_docstrings("""T5 Model with a `language modeling` head on top. """, T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) -class T5WithLMHead(T5PreTrainedModel): +class T5WithLMHeadModel(T5PreTrainedModel): r""" **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: Labels for computing the masked language modeling loss. 
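(A similar hedged sketch for the language modeling head, using the `decoder_lm_labels` keyword that the forward pops off; it mirrors the new tests rather than a released checkpoint)::

    import torch
    from transformers import T5Config, T5WithLMHeadModel

    config = T5Config(num_layers=6, num_heads=8)
    model = T5WithLMHeadModel(config)
    model.eval()
    input_ids = torch.randint(0, config.vocab_size, (1, 8))
    outputs = model(encoder_input_ids=input_ids,
                    decoder_input_ids=input_ids,
                    decoder_lm_labels=input_ids)       # loss is computed against the shifted labels
    loss, prediction_scores = outputs[:2]
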
@@ -704,14 +716,14 @@ class T5WithLMHead(T5PreTrainedModel): Examples:: tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased') - model = T5ForMaskedLM.from_pretrained('t5-base-uncased') + model = T5WithLMHeadModel.from_pretrained('t5-base-uncased') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 outputs = model(input_ids, lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ def __init__(self, config): - super(T5ForMaskedLM, self).__init__(config) + super(T5WithLMHeadModel, self).__init__(config) self.transformer = T5Model(config) self.lm_head = nn.Linear(config.d_model, config.vocab_size) @@ -721,11 +733,12 @@ class T5WithLMHead(T5PreTrainedModel): def get_output_embeddings(self): return self.lm_head - def forward(self, encoder_input_ids, decoder_input_ids, **kwargs): - outputs = self.transformer(encoder_input_ids, decoder_input_ids, **kwargs) + def forward(self, **kwargs): + lm_labels = kwargs.pop('decoder_lm_labels', None) + outputs = self.transformer(**kwargs) sequence_output = outputs[0] - lm_logits = self.cls(sequence_output) + lm_logits = self.lm_head(sequence_output) outputs = (lm_logits,) + outputs[2:] # Add hidden states and attention if they are here if lm_labels is not None: diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index ddc0f9f3de..42bf9ac3f5 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -73,6 +73,7 @@ class CommonTestCases: test_pruning = True test_resize_embeddings = True test_head_masking = True + is_encoder_decoder = False def test_save_load(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -114,10 +115,9 @@ class CommonTestCases: for model_class in self.all_model_classes: model = model_class(config) model.eval() - first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0] + first, second = model(**inputs_dict)[0], model(**inputs_dict)[0] self.assertEqual(first.ne(second).sum().item(), 0) - def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -127,31 +127,42 @@ class CommonTestCases: model = model_class(config) model.eval() outputs = model(**inputs_dict) - attentions = outputs[-1] + self_attentions = outputs[-1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( - list(attentions[0].shape[-3:]), + list(self_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, self.model_tester.seq_length, self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) out_len = len(outputs) + if self.is_encoder_decoder: + cross_attentions = outputs[-2] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + # Check attention is always last and order is fine config.output_attentions = True config.output_hidden_states = 
True model = model_class(config) model.eval() outputs = model(**inputs_dict) - self.assertEqual(out_len+1, len(outputs)) + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, True) - attentions = outputs[-1] - self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + self_attentions = outputs[-1] + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( - list(attentions[0].shape[-3:]), + list(self_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, self.model_tester.seq_length, self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) @@ -214,7 +225,6 @@ class CommonTestCases: self.assertTrue(models_equal) - def test_headmasking(self): if not self.test_head_masking: return @@ -268,7 +278,6 @@ class CommonTestCases: self.assertNotEqual( attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0) - def test_head_pruning(self): if not self.test_pruning: return @@ -411,7 +420,6 @@ class CommonTestCases: self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]}) - def test_hidden_states_output(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py new file mode 100644 index 0000000000..b8bb828ebd --- /dev/null +++ b/transformers/tests/modeling_t5_test.py @@ -0,0 +1,176 @@ +# coding=utf-8 +# Copyright 2018 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest + +from transformers import is_torch_available + +from .modeling_common_test import (CommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +if is_torch_available(): + from transformers import (T5Config, T5Model, T5WithLMHeadModel) + from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP +else: + pytestmark = pytest.mark.skip("Require Torch") + + +class T5ModelTest(CommonTestCases.CommonModelTester): + + all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else () + test_pruning = False + test_torchscript = False + test_resize_embeddings = False + is_encoder_decoder = True + + class T5ModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_labels=True, + vocab_size=99, + n_positions=14, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_range=0.02, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_labels = use_labels + self.vocab_size = vocab_size + self.n_positions = n_positions + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.d_ff = d_ff + self.relative_attention_num_buckets = relative_attention_num_buckets + self.dropout_rate = dropout_rate + self.initializer_range = initializer_range + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_labels = None + if self.use_labels: + token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + config = T5Config( + vocab_size_or_config_json_file=self.vocab_size, + n_positions=self.n_positions, + d_model=self.hidden_size, + d_ff=self.d_ff, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_range=self.initializer_range) + + return (config, input_ids, input_mask, token_labels) + + def check_loss_output(self, result): + self.parent.assertListEqual( + list(result["loss"].size()), + []) + + def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): + model = T5Model(config=config) + model.eval() + encoder_output, decoder_output = model(encoder_input_ids=input_ids, + decoder_input_ids=input_ids, + decoder_attention_mask=input_mask) + encoder_output, decoder_output = model(encoder_input_ids=input_ids, + decoder_input_ids=input_ids) + + result = { + "encoder_output": encoder_output, + "decoder_output": decoder_output, + } + self.parent.assertListEqual( + list(result["encoder_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(result["decoder_output"].size()), + [self.batch_size, self.seq_length, self.hidden_size]) + + + def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): + model = T5WithLMHeadModel(config=config) + model.eval() + loss, 
prediction_scores = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids, + decoder_attention_mask=input_mask, decoder_lm_labels=token_labels) + result = { + "loss": loss, + "prediction_scores": prediction_scores, + } + self.parent.assertListEqual( + list(result["prediction_scores"].size()), + [self.batch_size, self.seq_length, self.vocab_size]) + self.check_loss_output(result) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, input_mask, token_labels) = config_and_inputs + inputs_dict = {'encoder_input_ids': input_ids, + 'decoder_input_ids': input_ids, + 'decoder_attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = T5ModelTest.T5ModelTester(self) + self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_t5_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_model(*config_and_inputs) + + def test_with_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: + model = T5Model.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py new file mode 100644 index 0000000000..fac6763432 --- /dev/null +++ b/transformers/tests/modeling_tf_t5_test.py @@ -0,0 +1,190 @@ +# coding=utf-8 +# Copyright 2018 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import unittest +import shutil +import pytest +import sys + +from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) +from .configuration_common_test import ConfigTester + +from transformers import T5Config, is_tf_available + +if False: # is_tf_available(): + import tensorflow as tf + from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) +else: + pytestmark = pytest.mark.skip("Require TensorFlow") + + +class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): + + all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if False else () # is_tf_available() else () + + class TFT5ModelTester(object): + + def __init__(self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=16, + type_sequence_label_size=2, + initializer_range=0.02, + num_labels=3, + num_choices=4, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.initializer_range = initializer_range + self.num_labels = num_labels + self.num_choices = num_choices + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = T5Config( + vocab_size_or_config_json_file=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_size=self.type_vocab_size, + initializer_range=self.initializer_range) + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def create_and_check_t5_model(self, config, 
input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFT5Model(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + sequence_output, pooled_output = model(inputs) + + inputs = [input_ids, input_mask] + sequence_output, pooled_output = model(inputs) + + sequence_output, pooled_output = model(input_ids) + + result = { + "sequence_output": sequence_output.numpy(), + "pooled_output": pooled_output.numpy(), + } + self.parent.assertListEqual( + list(result["sequence_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) + + + def create_and_check_t5_with_lm_head(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + model = TFT5WithLMHeadModel(config=config) + inputs = {'input_ids': input_ids, + 'attention_mask': input_mask, + 'token_type_ids': token_type_ids} + prediction_scores, = model(inputs) + result = { + "prediction_scores": prediction_scores.numpy(), + } + self.parent.assertListEqual( + list(result["prediction_scores"].shape), + [self.batch_size, self.seq_length, self.vocab_size]) + + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + (config, input_ids, token_type_ids, input_mask, + sequence_labels, token_labels, choice_labels) = config_and_inputs + inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + return config, inputs_dict + + def setUp(self): + self.model_tester = TFT5ModelTest.TFT5ModelTester(self) + self.config_tester = ConfigTester(self, config_class=T5Config, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_t5_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_model(*config_and_inputs) + + def test_with_lm_head(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs) + + @pytest.mark.slow + def test_model_from_pretrained(self): + cache_dir = "/tmp/transformers_test/" + for model_name in ['t5-base']: + model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir) + shutil.rmtree(cache_dir) + self.assertIsNotNone(model) + +if __name__ == "__main__": + unittest.main() diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py new file mode 100644 index 0000000000..9362487d8d --- /dev/null +++ b/transformers/tests/tokenization_t5_test.py @@ -0,0 +1,77 @@ +# coding=utf-8 +# Copyright 2018 Google T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +import pytest + +from transformers.tokenization_t5 import (T5Tokenizer, SPIECE_UNDERLINE) + +from .tokenization_tests_commons import CommonTestCases + +SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), + 'fixtures/test_sentencepiece.model') + +class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = T5Tokenizer + + def setUp(self): + super(T5TokenizationTest, self).setUp() + + # We have a SentencePiece fixture for testing + tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def get_tokenizer(self, **kwargs): + return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"This is a test" + output_text = u"This is a test" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True) + + tokens = tokenizer.tokenize(u'This is a test') + self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382]) + + tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.") + self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', + u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'', + u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', + SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.']) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, [8, 21, 84, 55, 24, 19, 7, 0, + 602, 347, 347, 347, 3, 12, 66, + 46, 72, 80, 6, 0, 4]) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b', + u'or', u'n', SPIECE_UNDERLINE + u'in', + SPIECE_UNDERLINE + u'', u'', u'2', u'0', u'0', u'0', u',', + SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this', + SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', + u'', u'.']) + + +if __name__ == '__main__': + unittest.main() From ba10065c4b44d733d135ad6dc1b8a77f88c6dbb9 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 7 Nov 2019 15:55:36 +0100 Subject: [PATCH 014/302] update model, conversion script, tests and template --- ...t_xxx_original_tf_checkpoint_to_pytorch.py | 10 +- transformers/__init__.py | 1 + transformers/configuration_t5.py | 13 +- ...rt_t5_original_tf_checkpoint_to_pytorch.py | 12 +- transformers/modeling_t5.py | 129 ++++++++++++------ transformers/tests/modeling_common_test.py | 41 +++--- transformers/tests/modeling_t5_test.py | 12 +- transformers/tokenization_t5.py | 1 + 8 files changed, 135 insertions(+), 84 deletions(-) diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py index d50d129cba..9d389deaad 100755 --- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py +++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py @@ -26,9 +26,9 @@ from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx import logging logging.basicConfig(level=logging.INFO) -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path): +def 
convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model - config = XxxConfig.from_json_file(xxx_config_file) + config = XxxConfig.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) model = XxxForPreTraining(config) @@ -48,11 +48,11 @@ if __name__ == "__main__": type = str, required = True, help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--xxx_config_file", + parser.add_argument("--config_file", default = None, type = str, required = True, - help = "The config json file corresponding to the pre-trained XXX model. \n" + help = "The config json file corresponding to the pre-trained model. \n" "This specifies the model architecture.") parser.add_argument("--pytorch_dump_path", default = None, @@ -61,5 +61,5 @@ if __name__ == "__main__": help = "Path to the output PyTorch model.") args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.xxx_config_file, + args.config_file, args.pytorch_dump_path) diff --git a/transformers/__init__.py b/transformers/__init__.py index bf896276d6..601a068592 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -97,6 +97,7 @@ if is_torch_available(): DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel, + load_tf_weights_in_t5, T5_PRETRAINED_MODEL_ARCHIVE_MAP) # Optimization diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 9db918e59f..96e67758ac 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -57,8 +57,7 @@ class T5Config(PretrainedConfig): (e.g., 512 or 1024 or 2048). type_vocab_size: The vocabulary size of the `token_type_ids` passed into `T5Model`. - initializer_range: The sttdev of the truncated_normal_initializer for - initializing all weight matrices. + initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing). layer_norm_eps: The epsilon used by LayerNorm. 
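        Example (a sketch; the defaults below are the ones introduced in this patch)::

            config = T5Config()   # d_model=512, d_kv=64, d_ff=2048, num_layers=6, num_heads=8
            assert config.d_kv == config.d_model // config.num_heads   # relation checked by T5Attention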
""" pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -67,25 +66,27 @@ class T5Config(PretrainedConfig): vocab_size_or_config_json_file=32128, n_positions=512, d_model=512, + d_kv=64, d_ff=2048, - num_layers=12, - num_heads=12, + num_layers=6, + num_heads=8, relative_attention_num_buckets=32, dropout_rate=0.1, layer_norm_epsilon=1e-6, - initializer_range=0.02, + initializer_factor=1.0, **kwargs): super(T5Config, self).__init__(**kwargs) self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 self.n_positions = n_positions self.d_model = d_model + self.d_kv = d_kv self.d_ff = d_ff self.num_layers = num_layers self.num_heads = num_heads self.relative_attention_num_buckets = relative_attention_num_buckets self.dropout_rate = dropout_rate self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range + self.initializer_factor = initializer_factor if isinstance(vocab_size_or_config_json_file, six.string_types): with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py index 608027ebac..2b74d2dd93 100755 --- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py +++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py @@ -21,16 +21,16 @@ from __future__ import print_function import argparse import torch -from transformers import T5Config, T5ForPreTraining, load_tf_weights_in_t5 +from transformers import T5Config, T5Model, load_tf_weights_in_t5 import logging logging.basicConfig(level=logging.INFO) -def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, t5_config_file, pytorch_dump_path): +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): # Initialise PyTorch model - config = T5Config.from_json_file(t5_config_file) + config = T5Config.from_json_file(config_file) print("Building PyTorch model from configuration: {}".format(str(config))) - model = T5ForPreTraining(config) + model = T5Model(config) # Load weights from tf checkpoint load_tf_weights_in_t5(model, config, tf_checkpoint_path) @@ -48,7 +48,7 @@ if __name__ == "__main__": type = str, required = True, help = "Path to the TensorFlow checkpoint path.") - parser.add_argument("--t5_config_file", + parser.add_argument("--config_file", default = None, type = str, required = True, @@ -61,5 +61,5 @@ if __name__ == "__main__": help = "Path to the output PyTorch model.") args = parser.parse_args() convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, - args.t5_config_file, + args.config_file, args.pytorch_dump_path) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index ce443cf882..6ed241761a 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -65,34 +65,40 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): # Load weights from TF model init_vars = tf.train.list_variables(tf_path) names = [] - arrays = [] + tf_weights = {} for name, shape in init_vars: logger.info("Loading TF weight {} with shape {}".format(name, shape)) array = tf.train.load_variable(tf_path, name) names.append(name) - arrays.append(array) + tf_weights[name] = array - for name, array in zip(names, arrays): - name = name.split('/') + for txt_name in names: + name = txt_name.split('/') # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v # which are not required for using 
pretrained model if any(n in ["adam_v", "adam_m", "global_step"] for n in name): logger.info("Skipping {}".format("/".join(name))) + tf_weights.pop(txt_name, None) + continue + if '_slot_' in name[-1]: + logger.info("Skipping {}".format("/".join(name))) + tf_weights.pop(txt_name, None) continue pointer = model + array = tf_weights[txt_name] for m_name in name: if re.fullmatch(r'[A-Za-z]+_\d+', m_name): l = re.split(r'_(\d+)', m_name) else: l = [m_name] - if l[0] == 'kernel' or l[0] == 'gamma': + if l[0] in ['kernel', 'scale', 'embedding']: pointer = getattr(pointer, 'weight') - elif l[0] == 'output_bias' or l[0] == 'beta': - pointer = getattr(pointer, 'bias') - elif l[0] == 'output_weights': - pointer = getattr(pointer, 'weight') - elif l[0] == 'squad': - pointer = getattr(pointer, 'classifier') + # elif l[0] == 'scale': + # pointer = getattr(pointer, 'weight') + # elif l[0] == 'output_bias' or l[0] == 'beta': + # pointer = getattr(pointer, 'bias') + # elif l[0] == 'squad': + # pointer = getattr(pointer, 'classifier') else: try: pointer = getattr(pointer, l[0]) @@ -102,9 +108,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): if len(l) >= 2: num = int(l[1]) pointer = pointer[num] - if m_name[-11:] == '_embeddings': + if l[0] not in ['kernel', 'scale', 'embedding']: pointer = getattr(pointer, 'weight') - elif m_name == 'kernel': + if l[0] != 'embedding': + logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name)) array = np.transpose(array) try: assert pointer.shape == array.shape @@ -112,7 +119,11 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): e.args += (pointer.shape, array.shape) raise logger.info("Initialize PyTorch weight {}".format(name)) - pointer.data = torch.from_numpy(array) + pointer.data = torch.from_numpy(array.astype(np.float32)) + tf_weights.pop(txt_name, None) + + logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) + # logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys()))) return model @@ -163,10 +174,13 @@ class T5Attention(nn.Module): self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets self.dim = config.d_model + self.d_kv = config.d_kv self.n_heads = config.num_heads self.dropout = config.dropout_rate assert self.dim % self.n_heads == 0 + assert self.dim // self.n_heads == self.d_kv + # Mesh TensorFlow initialization to avoid scaling before softmax self.q = nn.Linear(self.dim, self.dim, bias=False) self.k = nn.Linear(self.dim, self.dim, bias=False) self.v = nn.Linear(self.dim, self.dim, bias=False) @@ -312,8 +326,9 @@ class T5Attention(nn.Module): scores += position_bias if mask is not None: - mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) - scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + scores += mask + # mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) @@ -378,34 +393,35 @@ class T5Block(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super(T5Block, self).__init__() self.is_decoder = config.is_decoder - self.layer_000 = T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias) + self.layer = 
nn.ModuleList() + self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)) if self.is_decoder: - self.layer_001 = T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_002 = T5LayerFF(config) + self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias)) + self.layer.append(T5LayerFF(config)) else: - self.layer_001 = T5LayerFF(config) + self.layer.append(T5LayerFF(config)) def forward(self, hidden_states, attention_mask=None, position_bias=None, encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, head_mask=None): - self_attention_outputs = self.layer_000(hidden_states, + self_attention_outputs = self.layer[0](hidden_states, attention_mask=attention_mask, position_bias=position_bias, head_mask=head_mask) hidden_states = self_attention_outputs[0] outputs = self_attention_outputs[1:] - if self.is_decoder: - cross_attention_outputs = self.layer_001(hidden_states, - kv=encoder_hidden_states, - attention_mask=encoder_attention_mask, - position_bias=encoder_decoder_position_bias, - head_mask=head_mask) + if not self.is_decoder: + hidden_states = self.layer[1](hidden_states) + else: + cross_attention_outputs = self.layer[1](hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask) hidden_states = cross_attention_outputs[0] outputs = cross_attention_outputs[1:] + outputs - hidden_states = self.layer_002(hidden_states) - else: - hidden_states = self.layer_001(hidden_states) + hidden_states = self.layer[2](hidden_states) outputs = (hidden_states,) + outputs # add attentions if we output them return outputs @@ -422,15 +438,36 @@ class T5PreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ - if isinstance(module, (nn.Linear, nn.Embedding)): - # Slightly different from the TF version which uses truncated_normal for initialization - # cf https://github.com/pytorch/pytorch/pull/5617 - module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) - elif isinstance(module, nn.LayerNorm): - module.bias.data.zero_() - module.weight.data.fill_(1.0) - if isinstance(module, nn.Linear) and module.bias is not None: + factor = self.config.initializer_factor # Used for testing weights initialization + if isinstance(module, nn.LayerNorm): module.bias.data.zero_() + module.weight.data.fill_(factor*1.0) + elif isinstance(module, T5Model): + # Mesh TensorFlow embeddings initialization + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 + module.shared.weight.data.normal_(mean=0.0, std=factor*1.0) + elif isinstance(module, T5DenseReluDense): + # Mesh TensorFlow FF initialization + # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56 + # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89 + module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5)) + if hasattr(module.wi, 'bias') and module.wi.bias is not None: + module.wi.bias.data.zero_() + module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5)) + if hasattr(module.wo, 'bias') and module.wo.bias is not None: + module.wo.bias.data.zero_() + elif isinstance(module, T5Attention): + # Mesh TensorFlow attention 
initialization to avoid scaling before softmax + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136 + d_model = self.config.d_model + d_kv = self.config.d_kv + n_heads = self.config.num_heads + module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5)) + module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) + module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5)) + module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5)) + if module.has_relative_attention_bias: + module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5)) class T5Stack(T5PreTrainedModel): @@ -440,8 +477,8 @@ class T5Stack(T5PreTrainedModel): self.output_hidden_states = config.output_hidden_states self.is_decoder = config.is_decoder - self.blocks = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) - for i in range(config.num_layers)]) + self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) + for i in range(config.num_layers)]) self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) @@ -518,7 +555,7 @@ class T5Stack(T5PreTrainedModel): all_attentions = () position_bias = None encoder_decoder_position_bias = None - for i, layer_module in enumerate(self.blocks): + for i, layer_module in enumerate(self.block): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -724,9 +761,10 @@ class T5WithLMHeadModel(T5PreTrainedModel): """ def __init__(self, config): super(T5WithLMHeadModel, self).__init__(config) + self.model_dim = config.d_model self.transformer = T5Model(config) - self.lm_head = nn.Linear(config.d_model, config.vocab_size) + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) self.init_weights() @@ -738,15 +776,18 @@ class T5WithLMHeadModel(T5PreTrainedModel): outputs = self.transformer(**kwargs) sequence_output = outputs[0] + # Rescale output before projecting on vocab + # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 + sequence_output = sequence_output * (self.model_dim ** -0.5) lm_logits = self.lm_head(sequence_output) - outputs = (lm_logits,) + outputs[2:] # Add hidden states and attention if they are here + outputs = (lm_logits,) + outputs[1:] # Add hidden states and attention if they are here if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - outputs = (loss,) + outputs + outputs = (loss,) + outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 return outputs # (lm_loss), lm_logits, (hidden_states), (attentions) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 42bf9ac3f5..ee75da605c 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -59,7 +59,7 @@ else: def _config_zero_init(config): configs_no_init = copy.deepcopy(config) for key in configs_no_init.__dict__.keys(): - if '_range' in key or '_std' in key: + if '_range' in key or '_std' in key or 
'initializer_factor' in key: setattr(configs_no_init, key, 0.0) return configs_no_init @@ -83,20 +83,24 @@ class CommonTestCases: model.eval() with torch.no_grad(): outputs = model(**inputs_dict) + out_2 = outputs[0].numpy() + out_2[np.isnan(out_2)] = 0 with TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) model = model_class.from_pretrained(tmpdirname) - with torch.no_grad(): - after_outputs = model(**inputs_dict) - # Make sure we don't have nans - out_1 = after_outputs[0].numpy() - out_2 = outputs[0].numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) + with torch.no_grad(): + after_outputs = model(**inputs_dict) + + # # Make sure we don't have nans + out_1 = after_outputs[0].numpy() + out_1[np.isnan(out_1)] = 0 + + out_1 = out_1 - out_2 + amax = np.amax(out_1) + amin = np.amin(out_1) + self.assertLessEqual(max(amax, -amin), 1e-5) def test_initialization(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -127,27 +131,28 @@ class CommonTestCases: model = model_class(config) model.eval() outputs = model(**inputs_dict) - self_attentions = outputs[-1] + attentions = outputs[-1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) self.assertListEqual( - list(self_attentions[0].shape[-3:]), + list(attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, self.model_tester.seq_length, self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) out_len = len(outputs) if self.is_encoder_decoder: - cross_attentions = outputs[-2] + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs[(out_len // 2)-1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) - self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) self.assertListEqual( - list(cross_attentions[0].shape[-3:]), + list(decoder_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) # Check attention is always last and order is fine config.output_attentions = True diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index b8bb828ebd..2c67b83c25 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -57,7 +57,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester): d_ff=37, relative_attention_num_buckets=8, dropout_rate=0.1, - initializer_range=0.02, + initializer_factor=0.002, scope=None, ): self.parent = parent @@ -74,7 +74,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester): self.d_ff = d_ff self.relative_attention_num_buckets = relative_attention_num_buckets self.dropout_rate = dropout_rate - self.initializer_range = initializer_range + self.initializer_factor = initializer_factor self.scope = scope def prepare_config_and_inputs(self): @@ -93,11 +93,12 @@ class 
T5ModelTest(CommonTestCases.CommonModelTester): n_positions=self.n_positions, d_model=self.hidden_size, d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, num_layers=self.num_hidden_layers, num_heads=self.num_attention_heads, relative_attention_num_buckets=self.relative_attention_num_buckets, dropout_rate=self.dropout_rate, - initializer_range=self.initializer_range) + initializer_factor=self.initializer_factor) return (config, input_ids, input_mask, token_labels) @@ -130,8 +131,9 @@ class T5ModelTest(CommonTestCases.CommonModelTester): def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = T5WithLMHeadModel(config=config) model.eval() - loss, prediction_scores = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids, - decoder_attention_mask=input_mask, decoder_lm_labels=token_labels) + outputs = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids, + decoder_attention_mask=input_mask, decoder_lm_labels=token_labels) + loss, prediction_scores = outputs[0], outputs[1] result = { "loss": loss, "prediction_scores": prediction_scores, diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index cff6a41baf..ae898ba0d3 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -18,6 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os +from shutil import copyfile from .tokenization_utils import PreTrainedTokenizer From 8fda532c3cbab9e31fbbfa860f232b69e0f80633 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 7 Nov 2019 17:09:50 +0100 Subject: [PATCH 015/302] fix python 2 sentencepiece tokenization --- transformers/tests/tokenization_t5_test.py | 7 +++--- transformers/tokenization_t5.py | 26 ++++++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py index 9362487d8d..aabb21e443 100644 --- a/transformers/tests/tokenization_t5_test.py +++ b/transformers/tests/tokenization_t5_test.py @@ -18,7 +18,8 @@ import os import unittest import pytest -from transformers.tokenization_t5 import (T5Tokenizer, SPIECE_UNDERLINE) +from transformers.tokenization_t5 import (T5Tokenizer) +from transformers.tokenization_xlnet import SPIECE_UNDERLINE from .tokenization_tests_commons import CommonTestCases @@ -33,7 +34,7 @@ class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): super(T5TokenizationTest, self).setUp() # We have a SentencePiece fixture for testing - tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer = T5Tokenizer(SAMPLE_VOCAB) tokenizer.save_pretrained(self.tmpdirname) def get_tokenizer(self, **kwargs): @@ -45,7 +46,7 @@ class T5TokenizationTest(CommonTestCases.CommonTokenizerTester): return input_text, output_text def test_full_tokenizer(self): - tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer = T5Tokenizer(SAMPLE_VOCAB) tokens = tokenizer.tokenize(u'This is a test') self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est']) diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index ae898ba0d3..93842d29f0 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -18,6 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os +import six from shutil import copyfile from .tokenization_utils import PreTrainedTokenizer @@ -96,18 +97,35 @@ 
class T5Tokenizer(PreTrainedTokenizer): self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(self.vocab_file) - def _tokenize(self, text): + def _tokenize(self, text, return_unicode=True, sample=False): """ Take as input a string and return a list of strings (tokens) for words/sub-words """ - return self.sp_model.EncodeAsPieces(text) + if not sample: + pieces = self.sp_model.EncodeAsPieces(text) + else: + pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + + # convert back to unicode for py2 + if six.PY2 and return_unicode: + ret_pieces = [] + for piece in pieces: + if isinstance(piece, str): + piece = piece.decode('utf-8') + ret_pieces.append(piece) + pieces = ret_pieces + + return pieces def _convert_token_to_id(self, token): """ Converts a token (str/unicode) in an id using the vocab. """ return self.sp_model.piece_to_id(token) - def _convert_id_to_token(self, index): + def _convert_id_to_token(self, index, return_unicode=True): """Converts an index (integer) in a token (string/unicode) using the vocab.""" - return self.sp_model.id_to_piece(index) + token = self.sp_model.IdToPiece(index) + if six.PY2 and return_unicode and isinstance(token, str): + token = token.decode('utf-8') + return token def convert_tokens_to_string(self, tokens): """ Converts a sequence of tokens (string) in a single string. """ From 727a79b305364522b6853679c5523efd9de7f772 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 8 Nov 2019 11:35:03 +0100 Subject: [PATCH 016/302] added TF2 model and tests - updated templates --- .../adding_a_new_model/modeling_tf_xxx.py | 2 + templates/adding_a_new_model/modeling_xxx.py | 2 + transformers/__init__.py | 3 + transformers/configuration_auto.py | 6 +- transformers/configuration_t5.py | 3 +- transformers/modeling_t5.py | 79 +- transformers/modeling_tf_pytorch_utils.py | 4 +- transformers/modeling_tf_t5.py | 783 +++++++++++------- transformers/modeling_utils.py | 6 +- transformers/tests/modeling_tf_common_test.py | 23 +- transformers/tests/modeling_tf_t5_test.py | 116 ++- 11 files changed, 646 insertions(+), 381 deletions(-) diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py index c661975768..b58817e453 100644 --- a/templates/adding_a_new_model/modeling_tf_xxx.py +++ b/templates/adding_a_new_model/modeling_tf_xxx.py @@ -26,6 +26,8 @@ import logging import math import os import sys +import copy +import itertools from io import open import numpy as np diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py index ee705e753c..9c3505f0cf 100644 --- a/templates/adding_a_new_model/modeling_xxx.py +++ b/templates/adding_a_new_model/modeling_xxx.py @@ -25,6 +25,8 @@ import logging import math import os import sys +import copy +import itertools from io import open import torch diff --git a/transformers/__init__.py b/transformers/__init__.py index 601a068592..b882f4d968 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -158,6 +158,9 @@ if is_tf_available(): TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel, + TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) + # TF 2.0 <=> PyTorch conversion utilities from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name, load_pytorch_checkpoint_in_tf2_model, diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index edd21a670c..3bee5b84a1 100644 --- 
a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -27,6 +27,7 @@ from .configuration_xlm import XLMConfig from .configuration_roberta import RobertaConfig from .configuration_distilbert import DistilBertConfig from .configuration_ctrl import CTRLConfig +from .configuration_t5 import T5Config logger = logging.getLogger(__name__) @@ -64,6 +65,7 @@ class AutoConfig(object): The configuration class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: T5Config (T5 model) - contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `bert`: BertConfig (Bert model) - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) @@ -114,7 +116,9 @@ class AutoConfig(object): assert unused_kwargs == {'foo': False} """ - if 'distilbert' in pretrained_model_name_or_path: + if 't5' in pretrained_model_name_or_path: + return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 96e67758ac..83aab66fac 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -27,8 +27,7 @@ from .configuration_utils import PretrainedConfig logger = logging.getLogger(__name__) T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-config.json", - 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-config.json", + 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", } diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 6ed241761a..6be0ae6863 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -41,8 +41,7 @@ logger = logging.getLogger(__name__) # for the pretrained weights provided with the models #################################################### T5_PRETRAINED_MODEL_ARCHIVE_MAP = { - 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-pytorch_model.bin", - 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-pytorch_model.bin", + 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin", } #################################################### @@ -442,7 +441,7 @@ class T5PreTrainedModel(PreTrainedModel): if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(factor*1.0) - elif isinstance(module, T5Model): + elif isinstance(module, (T5Model, T5WithLMHeadModel)): # Mesh TensorFlow embeddings initialization # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624 module.shared.weight.data.normal_(mean=0.0, std=factor*1.0) @@ -502,11 +501,10 @@ class T5Stack(T5PreTrainedModel): # ourselves in which case we just need to make it broadcastable to all heads. 
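        # For example, a 2D padding mask of shape [batch_size, seq_length] becomes
        # [batch_size, 1, 1, seq_length] (or [batch_size, 1, seq_length, seq_length] once the
        # causal mask is folded in for the decoder); it is then converted to an additive bias
        # with (1.0 - mask) * -10000.0 so that masked positions receive a large negative score.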
if attention_mask.dim() == 3: extended_attention_mask = attention_mask[:, None, :, :] - + elif attention_mask.dim() == 2: # Provided a padding mask of dimensions [batch_size, seq_length] # - if the model is a decoder, apply a causal mask in addition to the padding mask # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] - if attention_mask.dim() == 2: if self.config.is_decoder: seq_ids = torch.arange(seq_length, device=hidden_states.device) causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] @@ -593,7 +591,7 @@ class T5Stack(T5PreTrainedModel): T5_START_DOCSTRING = r""" The T5 model was proposed in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. - It's an encoder decoder pre-trained transformer. + It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting. This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. @@ -634,16 +632,13 @@ T5_INPUTS_DOCSTRING = r""" Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. 
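        Example of the ``attention_mask`` convention above (``0`` is used here only as a stand-in padding id)::

            import torch
            input_ids = torch.tensor([[37, 423, 215, 0, 0]])   # padded batch of size 1
            attention_mask = (input_ids != 0).long()            # -> [[1, 1, 1, 0, 0]]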
""" -@add_start_docstrings("The bare single stack (encoder or decoder) of a T5 Model transformer outputting raw hidden-states" +@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states" "without any specific head on top.", T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) class T5Model(T5PreTrainedModel): @@ -661,8 +656,8 @@ class T5Model(T5PreTrainedModel): Examples:: - tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased') - model = T5Model.from_pretrained('t5-base-uncased') + tokenizer = T5Tokenizer.from_pretrained('t5-small') + model = T5Model.from_pretrained('t5-small') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple @@ -752,8 +747,8 @@ class T5WithLMHeadModel(T5PreTrainedModel): Examples:: - tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased') - model = T5WithLMHeadModel.from_pretrained('t5-base-uncased') + tokenizer = T5Tokenizer.from_pretrained('t5-small') + model = T5WithLMHeadModel.from_pretrained('t5-small') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 outputs = model(input_ids, lm_labels=input_ids) loss, prediction_scores = outputs[:2] @@ -763,31 +758,73 @@ class T5WithLMHeadModel(T5PreTrainedModel): super(T5WithLMHeadModel, self).__init__(config) self.model_dim = config.d_model - self.transformer = T5Model(config) + self.shared = nn.Embedding(config.vocab_size, config.d_model) + + encoder_config = copy.deepcopy(config) + self.encoder = T5Stack(encoder_config) + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + self.decoder = T5Stack(decoder_config) + self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False) self.init_weights() + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + def get_output_embeddings(self): return self.lm_head def forward(self, **kwargs): - lm_labels = kwargs.pop('decoder_lm_labels', None) - outputs = self.transformer(**kwargs) + # keyword arguments come in 3 flavors: encoder-specific (prefixed by + # `encoder_`), decoder-specific (prefixed by `decoder_`) and those + # that apply to the model as whole. + # We let the specific kwargs override the common ones in case of conflict. 
- sequence_output = outputs[0] + lm_labels = kwargs.pop('decoder_lm_labels', None) + + kwargs_common = dict((k, v) for k, v in kwargs.items() + if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_encoder = kwargs_common.copy() + kwargs_decoder = kwargs_common.copy() + kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + + # Encode if needed (training, first prediction pass) + encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) + if encoder_hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) + encoder_hidden_states = encoder_outputs[0] + else: + encoder_outputs = () + + # Decode + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states + kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) + decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) + + sequence_output = decoder_outputs[0] # Rescale output before projecting on vocab # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586 sequence_output = sequence_output * (self.model_dim ** -0.5) lm_logits = self.lm_head(sequence_output) - outputs = (lm_logits,) + outputs[1:] # Add hidden states and attention if they are here + decoder_outputs = (lm_logits,) + decoder_outputs[1:] # Add hidden states and attention if they are here if lm_labels is not None: shift_logits = lm_logits[..., :-1, :].contiguous() shift_labels = lm_labels[..., 1:].contiguous() loss_fct = CrossEntropyLoss(ignore_index=-1) loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)) - outputs = (loss,) + outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 + decoder_outputs = (loss,) + decoder_outputs # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666 - return outputs # (lm_loss), lm_logits, (hidden_states), (attentions) + return decoder_outputs + encoder_outputs diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index 88ce4d4610..6330c2748c 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -156,7 +156,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a e.args += (symbolic_weight.shape, array.shape) raise e - logger.info("Initialize TF weight {}".format(symbolic_weight.name)) + # logger.warning("Initialize TF weight {}".format(symbolic_weight.name)) weight_value_tuples.append((symbolic_weight, array)) all_pytorch_weights.discard(name) @@ -269,7 +269,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F e.args += (pt_weight.shape, array.shape) raise e - logger.info("Initialize PyTorch weight {}".format(pt_weight_name)) + # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name)) new_pt_params_dict[pt_weight_name] = torch.from_numpy(array) 
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array) diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index deb453846c..c1de4745c2 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -22,24 +22,21 @@ import logging import math import os import sys +import copy +import itertools from io import open import numpy as np import tensorflow as tf from .configuration_t5 import T5Config -from .modeling_tf_utils import TFPreTrainedModel, get_initializer +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer, DUMMY_INPUTS from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) -#################################################### -# This dict contrains shortcut names and associated url -# for the pretrained weights provided with the models -#################################################### TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { - 't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-tf_model.h5", - 't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-tf_model.h5", + 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5", } #################################################### @@ -48,33 +45,294 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) #################################################### -#################################################### -# Here is an example of typical layer in a TF 2.0 model of the library -# The classes are usually identical to the PyTorch ones and prefixed with 'TF'. -# -# Note that class __init__ parameters includes **kwargs (send to 'super'). -# This let us have a control on class scope and variable names: -# More precisely, we set the names of the class attributes (lower level layers) to -# to the equivalent attributes names in the PyTorch model so we can have equivalent -# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other. 
-# -# See the conversion methods in modeling_tf_pytorch_utils.py for more details -#################################################### -class TFT5Layer(tf.keras.layers.Layer): +class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): - super(TFT5Layer, self).__init__(**kwargs) - self.attention = TFT5Attention(config, name='attention') - self.intermediate = TFT5Intermediate(config, name='intermediate') - self.transformer_output = TFT5Output(config, name='output') + super(TFT5DenseReluDense, self).__init__(**kwargs) + self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi') + self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo') + self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + self.act = tf.keras.activations.relu - def call(self, inputs, training=False): - hidden_states, attention_mask, head_mask = inputs + def call(self, hidden_states, training=False): + h = self.wi(hidden_states) + h = self.act(h) + h = self.dropout(h, training=training) + h = self.wo(h) + return h - attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training) - attention_output = attention_outputs[0] - intermediate_output = self.intermediate(attention_output) - layer_output = self.transformer_output([intermediate_output, attention_output], training=training) - outputs = (layer_output,) + attention_outputs[1:] # add attentions if we output them + +class TFT5LayerFF(tf.keras.layers.Layer): + def __init__(self, config, **kwargs): + super(TFT5LayerFF, self).__init__(**kwargs) + self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, + name='layer_norm') + self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + + def call(self, hidden_states, training=False): + norm_x = self.layer_norm(hidden_states) + y = self.DenseReluDense(norm_x, training=training) + layer_output = hidden_states + self.dropout(y, training=training) + return layer_output + + +class TFT5Attention(tf.keras.layers.Layer): + NEW_ID = itertools.count() + + def __init__(self, config, has_relative_attention_bias=False, **kwargs): + super(TFT5Attention, self).__init__(**kwargs) + self.layer_id = next(TFT5Attention.NEW_ID) + self.is_decoder = config.is_decoder + self.has_relative_attention_bias = has_relative_attention_bias + + self.output_attentions = config.output_attentions + self.relative_attention_num_buckets = config.relative_attention_num_buckets + self.dim = config.d_model + self.d_kv = config.d_kv + self.n_heads = config.num_heads + assert self.dim % self.n_heads == 0 + assert self.dim // self.n_heads == self.d_kv + + # Mesh TensorFlow initialization to avoid scaling before softmax + self.q = tf.keras.layers.Dense(self.dim, use_bias=False, name='q') + self.k = tf.keras.layers.Dense(self.dim, use_bias=False, name='k') + self.v = tf.keras.layers.Dense(self.dim, use_bias=False, name='v') + self.o = tf.keras.layers.Dense(self.dim, use_bias=False, name='o') + self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + + if self.has_relative_attention_bias: + self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets, + self.n_heads, + name='relative_attention_bias') + self.pruned_heads = set() + + def prune_heads(self, heads): + raise NotImplementedError + + @staticmethod + def _relative_position_bucket(relative_position, + bidirectional=True, + num_buckets=32, + max_distance=128): + """ + Adapted 
from Mesh Tensorflow: + https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593 + + Translate relative position to a bucket number for relative attention. + The relative position is defined as memory_position - query_position, i.e. + the distance in tokens from the attending position to the attended-to + position. If bidirectional=False, then positive relative positions are + invalid. + We use smaller buckets for small absolute relative_position and larger buckets + for larger absolute relative_positions. All relative positions >=max_distance + map to the same bucket. All relative positions <=-max_distance map to the + same bucket. This should allow for more graceful generalization to longer + sequences than the model has been trained on. + Args: + relative_position: an int32 Tensor + bidirectional: a boolean - whether the attention is bidirectional + num_buckets: an integer + max_distance: an integer + Returns: + a Tensor with the same shape as relative_position, containing int32 + values in the range [0, num_buckets) + """ + ret = 0 + n = -relative_position + if bidirectional: + num_buckets //= 2 + ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets + n = tf.math.abs(n) + else: + n = tf.math.maximum(n, 0) + # now n is in the range [0, inf) + max_exact = num_buckets // 2 + is_small = tf.math.less(n, max_exact) + val_if_large = max_exact + tf.dtypes.cast( + tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact) + / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32) + val_if_large = tf.math.minimum(val_if_large, num_buckets - 1) + ret += tf.where(is_small, n, val_if_large) + return ret + + def compute_bias(self, qlen, klen): + """ Compute binned relative position bias """ + context_position = tf.range(qlen)[:, None] + memory_position = tf.range(klen)[None, :] + relative_position = memory_position - context_position # shape (qlen, klen) + rp_bucket = self._relative_position_bucket(relative_position, + bidirectional=not self.is_decoder, + num_buckets=self.relative_attention_num_buckets) + values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) + values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen) + return values + + def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False): + """ + Self-attention (if kv is None) or attention over source sentence (provided by kv). 
+ """ + # Input is (bs, qlen, dim) + # Mask is (bs, klen) (non-causal) or (bs, klen, klen) + bs, qlen, dim = shape_list(input) + if kv is None: + klen = qlen if cache is None else cache['slen'] + qlen + else: + klen = shape_list(kv)[1] + # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) + n_heads = self.n_heads + dim_per_head = self.dim // n_heads + + def shape(x): + """ projection """ + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + + def unshape(x): + """ compute context """ + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + + q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) + if kv is None: + k = shape(self.k(input)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(input)) # (bs, n_heads, qlen, dim_per_head) + elif cache is None or self.layer_id not in cache: + k = v = kv + k = shape(self.k(k)) # (bs, n_heads, qlen, dim_per_head) + v = shape(self.v(v)) # (bs, n_heads, qlen, dim_per_head) + + if cache is not None: + if self.layer_id in cache: + if kv is None: + k_, v_ = cache[self.layer_id] + k = tf.concat([k_, k], axis=2) # (bs, n_heads, klen, dim_per_head) + v = tf.concat([v_, v], axis=2) # (bs, n_heads, klen, dim_per_head) + else: + k, v = cache[self.layer_id] + cache[self.layer_id] = (k, v) + + # q = q / math.sqrt(dim_per_head) # No scaling in T5 + scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + + if position_bias is None: + if not self.has_relative_attention_bias: + raise ValueError("No position_bias provided and no weights to compute position_bias") + position_bias = self.compute_bias(qlen, klen) + scores += position_bias + + if mask is not None: + scores += mask + # mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + + weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) + weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) + + # Mask heads if we want to + if head_mask is not None: + weights = weights * head_mask + + context = tf.matmul(weights, v) # (bs, n_heads, qlen, dim_per_head) + context = unshape(context) # (bs, qlen, dim) + + context = self.o(context) + + outputs = (context,) + if self.output_attentions: + outputs = outputs + (weights,) + if self.has_relative_attention_bias: + outputs = outputs + (position_bias,) + return outputs + + +class TFT5LayerSelfAttention(tf.keras.layers.Layer): + def __init__(self, config, has_relative_attention_bias=False, **kwargs): + super(TFT5LayerSelfAttention, self).__init__(**kwargs) + self.SelfAttention = TFT5Attention(config, + has_relative_attention_bias=has_relative_attention_bias, + name='SelfAttention') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, + name='layer_norm') + self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + + def call(self, hidden_states, attention_mask=None, position_bias=None, + head_mask=None, training=False): + norm_x = self.layer_norm(hidden_states) + attention_output = self.SelfAttention(norm_x, + mask=attention_mask, + position_bias=position_bias, + head_mask=head_mask, + training=training) + y = attention_output[0] + layer_output = hidden_states + self.dropout(y, training=training) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class TFT5LayerCrossAttention(tf.keras.layers.Layer): + def 
__init__(self, config, has_relative_attention_bias=False, **kwargs): + super(TFT5LayerCrossAttention, self).__init__(**kwargs) + self.EncDecAttention = TFT5Attention(config, + has_relative_attention_bias=has_relative_attention_bias, + name='EncDecAttention') + self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, + name='layer_norm') + self.dropout = tf.keras.layers.Dropout(config.dropout_rate) + + def call(self, hidden_states, kv, attention_mask=None, position_bias=None, + head_mask=None, training=False): + norm_x = self.layer_norm(hidden_states) + attention_output = self.EncDecAttention(norm_x, + mask=attention_mask, + kv=kv, + position_bias=position_bias, + head_mask=head_mask, + training=training) + y = attention_output[0] + layer_output = hidden_states + self.dropout(y, training=training) + outputs = (layer_output,) + attention_output[1:] # add attentions if we output them + return outputs + + +class TFT5Block(tf.keras.layers.Layer): + def __init__(self, config, has_relative_attention_bias=False, **kwargs): + super(TFT5Block, self).__init__(**kwargs) + self.is_decoder = config.is_decoder + self.layer = [] + self.layer.append(TFT5LayerSelfAttention(config, + has_relative_attention_bias=has_relative_attention_bias, + name='layer_._0')) + if self.is_decoder: + self.layer.append(TFT5LayerCrossAttention(config, + has_relative_attention_bias=has_relative_attention_bias, + name='layer_._1')) + self.layer.append(TFT5LayerFF(config, name='layer_._2')) + else: + self.layer.append(TFT5LayerFF(config, name='layer_._1')) + + def call(self, hidden_states, attention_mask=None, position_bias=None, + encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None, + head_mask=None, training=False): + self_attention_outputs = self.layer[0](hidden_states, + attention_mask=attention_mask, + position_bias=position_bias, + head_mask=head_mask, + training=training) + hidden_states = self_attention_outputs[0] + outputs = self_attention_outputs[1:] + + if not self.is_decoder: + hidden_states = self.layer[1](hidden_states, training=training) + else: + cross_attention_outputs = self.layer[1](hidden_states, + kv=encoder_hidden_states, + attention_mask=encoder_attention_mask, + position_bias=encoder_decoder_position_bias, + head_mask=head_mask, + training=training) + hidden_states = cross_attention_outputs[0] + outputs = cross_attention_outputs[1:] + outputs + hidden_states = self.layer[2](hidden_states, training=training) + + outputs = (hidden_states,) + outputs # add attentions if we output them return outputs @@ -85,6 +343,19 @@ class TFT5Layer(tf.keras.layers.Layer): class TFT5MainLayer(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFT5MainLayer, self).__init__(**kwargs) + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.is_decoder = config.is_decoder + self.config = config + self.num_hidden_layers = config.num_layers + + self.block = [TFT5Block(config, + has_relative_attention_bias=bool(i == 0), + name='block_._{}'.format(i)) + for i in range(config.num_layers)] + self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, + name='final_layer_norm') + self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def _resize_token_embeddings(self, new_num_tokens): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models @@ -92,51 +363,56 @@ class TFT5MainLayer(tf.keras.layers.Layer): def _prune_heads(self, 
heads_to_prune): raise NotImplementedError # Not implemented yet in the library fr TF 2.0 models - def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False): - # We allow three types of multi-inputs: - # - traditional keyword arguments in the call method - # - all the arguments provided as a dict in the first positional argument of call - # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call - # The last two options are useful to use the tf.keras fit() method. - - if isinstance(inputs, (tuple, list)): - input_ids = inputs[0] - attention_mask = inputs[1] if len(inputs) > 1 else attention_mask - token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids - position_ids = inputs[3] if len(inputs) > 3 else position_ids - head_mask = inputs[4] if len(inputs) > 4 else head_mask - assert len(inputs) <= 5, "Too many inputs." - elif isinstance(inputs, dict): - input_ids = inputs.get('input_ids') - attention_mask = inputs.get('attention_mask', attention_mask) - token_type_ids = inputs.get('token_type_ids', token_type_ids) - position_ids = inputs.get('position_ids', position_ids) - head_mask = inputs.get('head_mask', head_mask) - assert len(inputs) <= 5, "Too many inputs." - else: - input_ids = inputs + def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None, + encoder_attention_mask=None, head_mask=None, training=False): + batch_size, seq_length = shape_list(hidden_states)[:2] if attention_mask is None: - attention_mask = tf.fill(tf.shape(input_ids), 1) - if token_type_ids is None: - token_type_ids = tf.fill(tf.shape(input_ids), 0) + attention_mask = tf.fill((batch_size, seq_length), 1) + if self.is_decoder and encoder_attention_mask is None: + encoder_seq_length = encoder_hidden_states.shape[1] + encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1) - # We create a 3D attention mask from a 2D tensor mask. - # Sizes are [batch_size, 1, 1, to_seq_length] - # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] - # this attention mask is more simple than the triangular masking of causal attention - # used in OpenAI GPT, we just need to prepare the broadcast dimension here. - extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :] + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. 
+ attention_mask = tf.cast(attention_mask, dtype=tf.float32) + num_dims_attention_mask = len(shape_list(attention_mask)) + if num_dims_attention_mask == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif num_dims_attention_mask == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + seq_ids = tf.range(seq_length) + causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)), + seq_ids[None, :, None]) + causal_mask = tf.cast(causal_mask, dtype=tf.float32) + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - - extended_attention_mask = tf.cast(extended_attention_mask, tf.float32) extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + if self.is_decoder: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + # Prepare head mask if needed # 1.0 in head_mask indicate we keep the head # attention_probs has shape bsz x n_heads x N x N @@ -148,14 +424,44 @@ class TFT5MainLayer(tf.keras.layers.Layer): head_mask = [None] * self.num_hidden_layers # head_mask = tf.constant([0] * self.num_hidden_layers) - ################################## - # Replace this with your model code - embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids) - encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training) - sequence_output = encoder_outputs[0] - outputs = (sequence_output,) + encoder_outputs[1:] # add hidden_states and attentions if they are here + all_hidden_states = () + all_attentions = () + position_bias = None + encoder_decoder_position_bias = None + for i, layer_module in enumerate(self.block): + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) - return outputs # sequence_output, (hidden_states), (attentions) + layer_outputs = layer_module(hidden_states, + attention_mask=extended_attention_mask, + position_bias=position_bias, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + encoder_decoder_position_bias=encoder_decoder_position_bias, + head_mask=head_mask[i], + training=training) + hidden_states = layer_outputs[0] + if i == 0: + position_bias = layer_outputs[2 if 
self.output_attentions else 1] + if self.is_decoder: + encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2] + + if self.output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.final_layer_norm(hidden_states) + layer_output = self.dropout(hidden_states, training=training) + + # Add last layer + if self.output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if self.output_hidden_states: + outputs = outputs + (all_hidden_states,) + if self.output_attentions: + outputs = outputs + (all_attentions,) + return outputs # last-layer hidden state, (all hidden states), (all attentions) #################################################### @@ -173,18 +479,26 @@ class TFT5PreTrainedModel(TFPreTrainedModel): pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP base_model_prefix = "transformer" + @property + def dummy_inputs(self): + input_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) + input_mask = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + dummy_inputs = {'decoder_input_ids': input_ids, + 'encoder_input_ids': input_ids, + 'decoder_attention_mask': input_mask} + return dummy_inputs -T5_START_DOCSTRING = r""" The XXX model was proposed in - `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ - by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer - pre-trained using a combination of masked language modeling objective and next sentence prediction - on a large corpus comprising the Toronto Book Corpus and Wikipedia. + +T5_START_DOCSTRING = r""" The T5 model was proposed in + `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_ + by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu. + It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting. This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and behavior. - .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`: - https://arxiv.org/abs/1810.04805 + .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`: + https://arxiv.org/abs/1910.10683 .. _`tf.keras.Model`: https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model @@ -206,67 +520,50 @@ T5_START_DOCSTRING = r""" The XXX model was proposed in `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ -XXX_INPUTS_DOCSTRING = r""" +T5_INPUTS_DOCSTRING = r""" Inputs: **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: Indices of input sequence tokens in the vocabulary. 
- To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows: + To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows: (a) For sequence pairs: ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: ``tokens: [CLS] the dog is hairy . [SEP]`` - - ``token_type_ids: 0 0 0 0 0 0 0`` - Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on - the right rather than the left. - Indices can be obtained using :class:`transformers.XxxTokenizer`. + T5 is a model with relative position embeddings so you should be able to pad the inputs on + the right or the left. + + Indices can be obtained using :class:`transformers.T5Tokenizer`. See :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. - **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: - Segment token indices to indicate first and second portions of the inputs. - Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` - corresponds to a `sentence B` token - (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). - **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``: - Indices of positions of each input sequence tokens in the position embeddings. - Selected in the range ``[0, config.max_position_embeddings - 1]``. **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. """ -@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.", - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) -class TFXxxModel(TFXxxPreTrainedModel): +@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states" + "without any specific head on top.", + T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +class TFT5Model(TFT5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)`` Sequence of hidden-states at the output of the last layer of the model. - **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)`` - Last layer hidden-state of the first token of the sequence (classification token) - further processed by a Linear layer and a Tanh activation function. The Linear - layer weights are trained from the next sentence prediction (classification) - objective during Xxx pretraining. This output is usually *not* a good summary - of the semantic content of the input, you're often better with averaging or pooling - the sequence of hidden-states for the whole input sequence. 
**hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) of shape ``(batch_size, sequence_length, hidden_size)``: @@ -278,27 +575,68 @@ class TFXxxModel(TFXxxPreTrainedModel): Examples:: import tensorflow as tf - from transformers import XxxTokenizer, TFXxxModel + from transformers import T5Tokenizer, TFT5Model - tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') - model = TFXxxModel.from_pretrained('xxx-base-uncased') + tokenizer = T5Tokenizer.from_pretrained('t5-small') + model = TFT5Model.from_pretrained('t5-small') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ def __init__(self, config, *inputs, **kwargs): - super(TFXxxModel, self).__init__(config, *inputs, **kwargs) - self.transformer = TFXxxMainLayer(config, name='transformer') + super(TFT5Model, self).__init__(config, *inputs, **kwargs) + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, + name='shared') - def call(self, inputs, **kwargs): - outputs = self.transformer(inputs, **kwargs) - return outputs + encoder_config = copy.deepcopy(config) + self.encoder = TFT5MainLayer(encoder_config, name='encoder') + + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + self.decoder = TFT5MainLayer(decoder_config, name='decoder') + + def call(self, decoder_input_ids, **kwargs): + # We allow two types of multi-inputs: + # - traditional keyword arguments in the call method + # - all the arguments provided as a dict in the first positional argument of call + # The last option is useful to use the tf.keras fit() method. + + if isinstance(decoder_input_ids, dict): + kwargs.update(decoder_input_ids) + else: + kwargs['decoder_input_ids'] = decoder_input_ids + + kwargs_common = dict((k, v) for k, v in kwargs.items() + if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_encoder = kwargs_common.copy() + kwargs_decoder = kwargs_common.copy() + kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) + + # Encode if needed (training, first prediction pass) + encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) + if encoder_hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) + encoder_hidden_states = encoder_outputs[0] + else: + encoder_outputs = () + + # Decode + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states + kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) + decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) + + return decoder_outputs + encoder_outputs -@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) -class TFXxxForMaskedLM(TFXxxPreTrainedModel): +@add_start_docstrings("""T5 Model with a `language modeling` head on top. 
""", + T5_START_DOCSTRING, T5_INPUTS_DOCSTRING) +class TFT5WithLMHeadModel(TFT5PreTrainedModel): r""" Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` @@ -314,183 +652,66 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel): Examples:: import tensorflow as tf - from transformers import XxxTokenizer, TFXxxForMaskedLM + from transformers import T5Tokenizer, TFT5WithLMHeadModel - tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') - model = TFXxxForMaskedLM.from_pretrained('xxx-base-uncased') + tokenizer = T5Tokenizer.from_pretrained('t5-small') + model = TFT5WithLMHeadModel.from_pretrained('t5-small') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) prediction_scores = outputs[0] """ def __init__(self, config, *inputs, **kwargs): - super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs) + super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs) + self.model_dim = config.d_model - self.transformer = TFXxxMainLayer(config, name='transformer') - self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm') + self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model, + name='shared') - def call(self, inputs, **kwargs): - outputs = self.transformer(inputs, **kwargs) + encoder_config = copy.deepcopy(config) + self.encoder = TFT5MainLayer(encoder_config, name='encoder') - sequence_output = outputs[0] - prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False)) + decoder_config = copy.deepcopy(config) + decoder_config.is_decoder = True + self.decoder = TFT5MainLayer(decoder_config, name='decoder') - outputs = (prediction_scores,) + outputs[2:] # Add hidden states and attention if they are here + def call(self, decoder_input_ids, **kwargs): + # We allow two types of multi-inputs: + # - traditional keyword arguments in the call method + # - all the arguments provided as a dict in the first positional argument of call + # The last option is useful to use the tf.keras fit() method. - return outputs # prediction_scores, (hidden_states), (attentions) + if isinstance(decoder_input_ids, dict): + kwargs.update(decoder_input_ids) + else: + kwargs['decoder_input_ids'] = decoder_input_ids + kwargs_common = dict((k, v) for k, v in kwargs.items() + if not k.startswith("encoder_") and not k.startswith("decoder_")) + kwargs_encoder = kwargs_common.copy() + kwargs_decoder = kwargs_common.copy() + kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_"))) + kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_"))) -@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of - the pooled output) e.g. for GLUE tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) -class TFXxxForSequenceClassification(TFXxxPreTrainedModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)`` - Classification (or regression if config.num_labels==1) scores (before SoftMax). 
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + # Encode if needed (training, first prediction pass) + encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) + if encoder_hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) + encoder_hidden_states = encoder_outputs[0] + else: + encoder_outputs = () - Examples:: + # Decode + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states + kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) + decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) - import tensorflow as tf - from transformers import XxxTokenizer, TFXxxForSequenceClassification + sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5) + lm_logits = self.shared(sequence_output, mode="linear") + decoder_outputs = (lm_logits,) + decoder_outputs[1:] - tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') - model = TFXxxForSequenceClassification.from_pretrained('xxx-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - logits = outputs[0] - - """ - def __init__(self, config, *inputs, **kwargs): - super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs) - self.num_labels = config.num_labels - - self.transformer = TFXxxMainLayer(config, name='transformer') - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') - - def call(self, inputs, **kwargs): - outputs = self.transformer(inputs, **kwargs) - - pooled_output = outputs[1] - - pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False)) - logits = self.classifier(pooled_output) - - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - - return outputs # logits, (hidden_states), (attentions) - - -@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of - the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) -class TFXxxForTokenClassification(TFXxxPreTrainedModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` - Classification scores (before SoftMax). 
- **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. - - Examples:: - - import tensorflow as tf - from transformers import XxxTokenizer, TFXxxForTokenClassification - - tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') - model = TFXxxForTokenClassification.from_pretrained('xxx-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - scores = outputs[0] - - """ - def __init__(self, config, *inputs, **kwargs): - super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs) - self.num_labels = config.num_labels - - self.transformer = TFXxxMainLayer(config, name='transformer') - self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) - self.classifier = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='classifier') - - def call(self, inputs, **kwargs): - outputs = self.transformer(inputs, **kwargs) - - sequence_output = outputs[0] - - sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False)) - logits = self.classifier(sequence_output) - - outputs = (logits,) + outputs[2:] # add hidden states and attention if they are here - - return outputs # scores, (hidden_states), (attentions) - - -@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of - the hidden-states output to compute `span start logits` and `span end logits`). """, - XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING) -class TFXxxForQuestionAnswering(TFXxxPreTrainedModel): - r""" - Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: - **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` - Span-start scores (before SoftMax). - **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)`` - Span-end scores (before SoftMax). - **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) - list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings) - of shape ``(batch_size, sequence_length, hidden_size)``: - Hidden-states of the model at the output of each layer plus the initial embedding outputs. - **attentions**: (`optional`, returned when ``config.output_attentions=True``) - list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
- - Examples:: - - import tensorflow as tf - from transformers import XxxTokenizer, TFXxxForQuestionAnswering - - tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased') - model = TFXxxForQuestionAnswering.from_pretrained('xxx-base-uncased') - input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) - start_scores, end_scores = outputs[:2] - - """ - def __init__(self, config, *inputs, **kwargs): - super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs) - self.num_labels = config.num_labels - - self.transformer = TFXxxMainLayer(config, name='transformer') - self.qa_outputs = tf.keras.layers.Dense(config.num_labels, - kernel_initializer=get_initializer(config.initializer_range), - name='qa_outputs') - - def call(self, inputs, **kwargs): - outputs = self.transformer(inputs, **kwargs) - - sequence_output = outputs[0] - - logits = self.qa_outputs(sequence_output) - start_logits, end_logits = tf.split(logits, 2, axis=-1) - start_logits = tf.squeeze(start_logits, axis=-1) - end_logits = tf.squeeze(end_logits, axis=-1) - - outputs = (start_logits, end_logits,) + outputs[2:] - - return outputs # start_logits, end_logits, (hidden_states), (attentions) + return decoder_outputs + encoder_outputs diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 063f52365d..5b1d3bb458 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -160,8 +160,7 @@ class PreTrainedModel(nn.Module): base_model.vocab_size = new_num_tokens # Tie weights again if needed - if hasattr(self, 'tie_weights'): - self.tie_weights() + self.tie_weights() return model_embeds @@ -458,8 +457,7 @@ class PreTrainedModel(nn.Module): raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format( model.__class__.__name__, "\n\t".join(error_msgs))) - if hasattr(model, 'tie_weights'): - model.tie_weights() # make sure word embedding weights are still tied + model.tie_weights() # make sure word embedding weights are still tied if needed # Set model in evaluation mode to desactivate DropOut modules by default model.eval() diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index f636c42889..6c3954a088 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -69,6 +69,7 @@ class TFCommonTestCases: test_torchscript = True test_pruning = True test_resize_embeddings = True + is_encoder_decoder = False def test_initialization(self): pass @@ -156,7 +157,11 @@ class TFCommonTestCases: def test_compile_tf_model(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32') + if self.is_encoder_decoder: + input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'), + 'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')} + else: + input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32') optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0) loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') @@ -189,7 +194,7 @@ class TFCommonTestCases: outputs_dict = model(inputs_dict) inputs_keywords = copy.deepcopy(inputs_dict) - input_ids = 
inputs_keywords.pop('input_ids') + input_ids = inputs_keywords.pop('input_ids', inputs_keywords.pop('decoder_input_ids')) outputs_keywords = model(input_ids, **inputs_keywords) output_dict = outputs_dict[0].numpy() @@ -216,12 +221,24 @@ class TFCommonTestCases: self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) out_len = len(outputs) + if self.is_encoder_decoder: + self.assertEqual(out_len % 2, 0) + decoder_attentions = outputs[(out_len // 2)-1] + self.assertEqual(model.config.output_attentions, True) + self.assertEqual(model.config.output_hidden_states, False) + self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, + self.model_tester.seq_length, + self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + # Check attention is always last and order is fine config.output_attentions = True config.output_hidden_states = True model = model_class(config) outputs = model(inputs_dict) - self.assertEqual(out_len+1, len(outputs)) + self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, True) diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index fac6763432..33f6f895f0 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -26,7 +26,7 @@ from .configuration_common_test import ConfigTester from transformers import T5Config, is_tf_available -if False: # is_tf_available(): +if is_tf_available(): import tensorflow as tf from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) else: @@ -35,7 +35,8 @@ else: class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): - all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if False else () # is_tf_available() else () + is_encoder_decoder = True + all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else () class TFT5ModelTester(object): @@ -45,22 +46,16 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): seq_length=7, is_training=True, use_input_mask=True, - use_token_type_ids=True, use_labels=True, vocab_size=99, + n_positions=14, hidden_size=32, num_hidden_layers=5, num_attention_heads=4, - intermediate_size=37, - hidden_act="gelu", - hidden_dropout_prob=0.1, - attention_probs_dropout_prob=0.1, - max_position_embeddings=512, - type_vocab_size=16, - type_sequence_label_size=2, - initializer_range=0.02, - num_labels=3, - num_choices=4, + d_ff=37, + relative_attention_num_buckets=8, + dropout_rate=0.1, + initializer_factor=0.002, scope=None, ): self.parent = parent @@ -68,22 +63,16 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): self.seq_length = seq_length self.is_training = is_training self.use_input_mask = use_input_mask - self.use_token_type_ids = use_token_type_ids self.use_labels = use_labels self.vocab_size = vocab_size + self.n_positions = n_positions self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers self.num_attention_heads = num_attention_heads - self.intermediate_size = intermediate_size - self.hidden_act = hidden_act - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = 
max_position_embeddings - self.type_vocab_size = type_vocab_size - self.type_sequence_label_size = type_sequence_label_size - self.initializer_range = initializer_range - self.num_labels = num_labels - self.num_choices = num_choices + self.d_ff = d_ff + self.relative_attention_num_buckets = relative_attention_num_buckets + self.dropout_rate = dropout_rate + self.initializer_factor = initializer_factor self.scope = scope def prepare_config_and_inputs(self): @@ -93,61 +82,53 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): if self.use_input_mask: input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) - token_type_ids = None - if self.use_token_type_ids: - token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) - - sequence_labels = None token_labels = None - choice_labels = None if self.use_labels: - sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) - token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) - choice_labels = ids_tensor([self.batch_size], self.num_choices) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) config = T5Config( vocab_size_or_config_json_file=self.vocab_size, - hidden_size=self.hidden_size, - num_hidden_layers=self.num_hidden_layers, - num_attention_heads=self.num_attention_heads, - intermediate_size=self.intermediate_size, - hidden_act=self.hidden_act, - hidden_dropout_prob=self.hidden_dropout_prob, - attention_probs_dropout_prob=self.attention_probs_dropout_prob, - max_position_embeddings=self.max_position_embeddings, - type_vocab_size=self.type_vocab_size, - initializer_range=self.initializer_range) + n_positions=self.n_positions, + d_model=self.hidden_size, + d_ff=self.d_ff, + d_kv=self.hidden_size // self.num_attention_heads, + num_layers=self.num_hidden_layers, + num_heads=self.num_attention_heads, + relative_attention_num_buckets=self.relative_attention_num_buckets, + dropout_rate=self.dropout_rate, + initializer_factor=self.initializer_factor) - return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + return (config, input_ids, input_mask, token_labels) - def create_and_check_t5_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): model = TFT5Model(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - sequence_output, pooled_output = model(inputs) + inputs = {'encoder_input_ids': input_ids, + 'decoder_input_ids': input_ids, + 'decoder_attention_mask': input_mask} + encoder_output, decoder_output = model(inputs) - inputs = [input_ids, input_mask] - sequence_output, pooled_output = model(inputs) - - sequence_output, pooled_output = model(input_ids) + encoder_output, decoder_output = model(input_ids, + decoder_attention_mask=input_mask, + encoder_input_ids=input_ids) result = { - "sequence_output": sequence_output.numpy(), - "pooled_output": pooled_output.numpy(), + "encoder_output": encoder_output.numpy(), + "decoder_output": decoder_output.numpy(), } self.parent.assertListEqual( - list(result["sequence_output"].shape), + list(result["encoder_output"].shape), + [self.batch_size, self.seq_length, self.hidden_size]) + self.parent.assertListEqual( + list(result["decoder_output"].shape), [self.batch_size, self.seq_length, self.hidden_size]) - 
self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size]) - def create_and_check_t5_with_lm_head(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels): + def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): model = TFT5WithLMHeadModel(config=config) - inputs = {'input_ids': input_ids, - 'attention_mask': input_mask, - 'token_type_ids': token_type_ids} - prediction_scores, = model(inputs) + inputs = {'encoder_input_ids': input_ids, + 'decoder_input_ids': input_ids, + 'decoder_attention_mask': input_mask} + prediction_scores, decoder_output = model(inputs) result = { "prediction_scores": prediction_scores.numpy(), } @@ -158,14 +139,15 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, token_type_ids, input_mask, - sequence_labels, token_labels, choice_labels) = config_and_inputs - inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask} + (config, input_ids, input_mask, token_labels) = config_and_inputs + inputs_dict = {'encoder_input_ids': input_ids, + 'decoder_input_ids': input_ids, + 'decoder_attention_mask': input_mask} return config, inputs_dict def setUp(self): self.model_tester = TFT5ModelTest.TFT5ModelTester(self) - self.config_tester = ConfigTester(self, config_class=T5Config, hidden_size=37) + self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37) def test_config(self): self.config_tester.run_common_tests() @@ -181,7 +163,7 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): @pytest.mark.slow def test_model_from_pretrained(self): cache_dir = "/tmp/transformers_test/" - for model_name in ['t5-base']: + for model_name in ['t5-small']: model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir) shutil.rmtree(cache_dir) self.assertIsNotNone(model) From 4321c541254bdabbda631520cff0a5a376ad9f48 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 8 Nov 2019 11:49:32 +0100 Subject: [PATCH 017/302] fix tests --- transformers/tests/modeling_tf_common_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index 6c3954a088..83a15c137a 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -194,7 +194,7 @@ class TFCommonTestCases: outputs_dict = model(inputs_dict) inputs_keywords = copy.deepcopy(inputs_dict) - input_ids = inputs_keywords.pop('input_ids', inputs_keywords.pop('decoder_input_ids')) + input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None) outputs_keywords = model(input_ids, **inputs_keywords) output_dict = outputs_dict[0].numpy() From f03c0c1423d4635f3e71a6c24053f01f6f02063c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 8 Nov 2019 11:49:46 +0100 Subject: [PATCH 018/302] adding models in readme and auto classes --- README.md | 3 ++- docs/source/pretrained_models.rst | 20 +++++++++++++++++++ transformers/__main__.py | 18 +++++++++++++++++ .../convert_pytorch_checkpoint_to_tf2.py | 13 ++++++++---- transformers/modeling_auto.py | 13 ++++++++++-- transformers/modeling_tf_auto.py | 13 ++++++++++-- transformers/tokenization_auto.py | 7 ++++++- 7 files changed, 77 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md 
index 40b08583b1..d6f6e426d8 100644 --- a/README.md +++ b/README.md @@ -122,7 +122,8 @@ At some point in the future, you'll be able to seamlessly move from pre-training 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. 8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation). 9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. -10. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. +10. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. +11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html). 
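The patches above rewire the TF T5 tests around an explicit encoder-decoder input dictionary. As a quick illustration of that call signature, here is a minimal, self-contained sketch of driving TFT5WithLMHeadModel the way the test helpers do. The dictionary keys mirror the test code (`encoder_input_ids`, `decoder_input_ids`, `decoder_attention_mask`); the tiny config values and dummy token ids are made up for illustration and do not correspond to any released checkpoint.

```python
import tensorflow as tf
from transformers import T5Config, TFT5WithLMHeadModel

# Tiny, illustrative configuration (in the spirit of the values used by the test helpers).
config = T5Config(vocab_size_or_config_json_file=99,
                  n_positions=14,
                  d_model=32,
                  d_ff=64,
                  num_layers=2,
                  num_heads=4,
                  relative_attention_num_buckets=8)

model = TFT5WithLMHeadModel(config=config)

# T5 is an encoder-decoder model, so the inputs dict carries separate encoder
# and decoder tensors, matching prepare_config_and_inputs_for_common above.
encoder_input_ids = tf.constant([[5, 17, 23, 8, 1, 2, 3]])   # (batch, encoder_seq_len)
decoder_input_ids = tf.constant([[5, 17, 23, 8, 1, 2, 3]])   # (batch, decoder_seq_len)
decoder_attention_mask = tf.ones_like(decoder_input_ids)

inputs = {'encoder_input_ids': encoder_input_ids,
          'decoder_input_ids': decoder_input_ids,
          'decoder_attention_mask': decoder_attention_mask}

prediction_scores, decoder_output = model(inputs)
print(prediction_scores.shape)  # (batch, decoder_seq_len, vocab_size)
```

The same dictionary shape is what `prepare_config_and_inputs_for_common` feeds to the common test machinery, which is why the keyword-argument test in [PATCH 017] now pops `decoder_input_ids` instead of `input_ids` for encoder-decoder models.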
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 43c08228bd..c6240dc850 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -144,5 +144,25 @@ Here is the full list of the currently provided pretrained models together with | CTRL | ``ctrl`` | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters | | | | | Salesforce's Large-sized CTRL English model | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| T5 | ``t5-small`` | | 6-layer, 768-hidden, 12-heads, 66M parameters | +| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``t5-base`` | | 6-layer, 768-hidden, 12-heads, 66M parameters | +| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``t5-large`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | +| | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``t5-3b`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | +| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``t5-11b`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | +| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. | +| | | (see `details `__) | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ .. 
`__ \ No newline at end of file diff --git a/transformers/__main__.py b/transformers/__main__.py index 31dbd24908..6136d768f6 100644 --- a/transformers/__main__.py +++ b/transformers/__main__.py @@ -6,6 +6,7 @@ def main(): "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" "It should be used as one of: \n" ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" + ">> transformers t5 TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" @@ -21,6 +22,23 @@ def main(): "https://www.tensorflow.org/install/ for installation instructions.") raise + if len(sys.argv) != 5: + # pylint: disable=line-too-long + print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") + else: + PYTORCH_DUMP_OUTPUT = sys.argv.pop() + TF_CONFIG = sys.argv.pop() + TF_CHECKPOINT = sys.argv.pop() + convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) + elif sys.argv[1] == "t5": + try: + from .convert_t5_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + except ImportError: + print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " + "In that case, it requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions.") + raise + if len(sys.argv) != 5: # pylint: disable=line-too-long print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index e673b77dcc..19629172ff 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -33,7 +33,8 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model, OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP) + CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP) if is_torch_available(): import torch @@ -46,7 +47,8 @@ if is_torch_available(): OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) + CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) else: (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, @@ -56,7 +58,8 @@ else: OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, DistilBertForMaskedLM, DistilBertForQuestionAnswering, 
DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) = ( + CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = ( None, None, None, None, None, None, None, None, @@ -65,6 +68,7 @@ else: None, None, None, None, None, None, None, None, + None, None, None, None) @@ -85,7 +89,8 @@ MODEL_CLASSES = { 'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), - 'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP) + 'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP), + 't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP), } def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True): diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index d98110d4bd..a2129176d3 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -27,6 +27,7 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification +from .modeling_t5 import T5Model, T5WithLMHeadModel from .modeling_utils import PreTrainedModel, SequenceSummary @@ -47,6 +48,7 @@ class AutoModel(object): The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: T5Model (T5 model) - contains `distilbert`: DistilBertModel (DistilBERT model) - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) @@ -70,6 +72,7 @@ class AutoModel(object): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: T5Model (T5 model) - contains `distilbert`: DistilBertModel (DistilBERT model) - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) @@ -136,7 +139,9 @@ class AutoModel(object): model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: + if 't5' in pretrained_model_name_or_path: + return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return 
RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) @@ -171,6 +176,7 @@ class AutoModelWithLMHead(object): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: T5ModelWithLMHead (T5 model) - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - contains `bert`: BertForMaskedLM (Bert model) @@ -197,6 +203,7 @@ class AutoModelWithLMHead(object): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: T5ModelWithLMHead (T5 model) - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - contains `bert`: BertForMaskedLM (Bert model) @@ -262,7 +269,9 @@ class AutoModelWithLMHead(object): model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: + if 't5' in pretrained_model_name_or_path: + return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index df0ad6e401..b24623dcdc 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -27,6 +27,7 @@ from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceC from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel +from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel from .file_utils import add_start_docstrings @@ -45,6 +46,7 @@ class TFAutoModel(object): The base model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: TFT5Model (T5 model) - contains `distilbert`: TFDistilBertModel (DistilBERT model) - contains `roberta`: TFRobertaModel (RoBERTa model) - contains `bert`: TFBertModel (Bert model) @@ -68,6 +70,7 @@ class TFAutoModel(object): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: TFT5Model (T5 model) - contains `distilbert`: TFDistilBertModel (DistilBERT model) - contains `roberta`: TFRobertaModel (RoBERTa model) - contains `bert`: TFTFBertModel (Bert model) @@ -133,7 +136,9 @@ class TFAutoModel(object): model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: + if 't5' in pretrained_model_name_or_path: + return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, 
*model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) @@ -169,6 +174,7 @@ class TFAutoModelWithLMHead(object): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: TFT5WithLMHeadModel (T5 model) - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model) - contains `bert`: TFBertForMaskedLM (Bert model) @@ -195,6 +201,7 @@ class TFAutoModelWithLMHead(object): The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: TFT5WithLMHeadModel (T5 model) - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model) - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model) - contains `bert`: TFBertForMaskedLM (Bert model) @@ -261,7 +268,9 @@ class TFAutoModelWithLMHead(object): model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config) """ - if 'distilbert' in pretrained_model_name_or_path: + if 't5' in pretrained_model_name_or_path: + return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index ec056de17f..5be2562448 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -27,6 +27,7 @@ from .tokenization_xlnet import XLNetTokenizer from .tokenization_xlm import XLMTokenizer from .tokenization_roberta import RobertaTokenizer from .tokenization_distilbert import DistilBertTokenizer +from .tokenization_t5 import T5Tokenizer logger = logging.getLogger(__name__) @@ -41,6 +42,7 @@ class AutoTokenizer(object): The tokenizer class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: T5Tokenizer (T5 model) - contains `distilbert`: DistilBertTokenizer (DistilBert model) - contains `roberta`: RobertaTokenizer (RoBERTa model) - contains `bert`: BertTokenizer (Bert model) @@ -64,6 +66,7 @@ class AutoTokenizer(object): The tokenizer class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): + - contains `t5`: T5Tokenizer (T5 model) - contains `distilbert`: DistilBertTokenizer (DistilBert model) - contains `roberta`: RobertaTokenizer (XLM model) - contains `bert`: BertTokenizer (Bert model) @@ -101,7 +104,9 @@ class AutoTokenizer(object): tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. 
tokenizer was saved using `save_pretrained('./test/saved_model/')` """ - if 'distilbert' in pretrained_model_name_or_path: + if 't5' in pretrained_model_name_or_path: + return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) From 15e53c4e8712260b016225310c397e19a5f7b21c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 8 Nov 2019 12:43:21 +0100 Subject: [PATCH 019/302] maybe fix tests --- transformers/tests/modeling_tf_common_test.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index 83a15c137a..20ccfd8ce0 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -131,7 +131,11 @@ class TFCommonTestCases: with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict) - max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy())) + tfo = tfo[0].numpy() + pto = pto[0].numpy() + tfo[np.isnan(tfo)] = 0 + pto[np.isnan(pto)] = 0 + max_diff = np.amax(np.abs(tfo - pto)) self.assertLessEqual(max_diff, 2e-2) # Check we can load pt model in tf and vice-versa with checkpoint => model functions @@ -151,7 +155,11 @@ class TFCommonTestCases: with torch.no_grad(): pto = pt_model(**pt_inputs_dict) tfo = tf_model(inputs_dict) - max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy())) + tfo = tfo[0].numpy() + pto = pto[0].numpy() + tfo[np.isnan(tfo)] = 0 + pto[np.isnan(pto)] = 0 + max_diff = np.amax(np.abs(tfo - pto)) self.assertLessEqual(max_diff, 2e-2) def test_compile_tf_model(self): From b4fcd59a5ae8d12102db106d3b03849ef86109bd Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 8 Nov 2019 14:38:53 +0100 Subject: [PATCH 020/302] add sentinels in tokenizer --- transformers/tokenization_t5.py | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 93842d29f0..3847aeefbf 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -18,6 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os +import re import six from shutil import copyfile @@ -31,7 +32,7 @@ SPIECE_UNDERLINE = u'▁' # Mapping from the keyword arguments names of Tokenizer `__init__` # to file names for serializing Tokenizer instances #################################################### -VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} +VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} #################################################### # Mapping from the keyword arguments names of Tokenizer `__init__` @@ -56,15 +57,27 @@ class T5Tokenizer(PreTrainedTokenizer): SentencePiece based tokenizer. Peculiarities: - requires `SentencePiece `_ + - `extra_ids` add a number of extra ids added to the end of the vocabulary for use as sentinels. + These tokens are accessible as `` where `{%d}` is a number between 0 and extra_ids-1. 
+ Extra tokens are indexed from the end of the vocabulary up to beginning (<extra_id_0> is the last token in the vocabulary) + (like in T5 preprocessing + see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117) """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>", - pad_token="<pad>", **kwargs): + pad_token="<pad>", extra_ids=100, additional_special_tokens=None, **kwargs): + # Add extra_ids to the special token list + if extra_ids > 0: + if additional_special_tokens is None: + additional_special_tokens = [] + additional_special_tokens.extend([u"<extra_id_{}>".format(i) for i in range(extra_ids)]) + super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token, - pad_token=pad_token, **kwargs) + pad_token=pad_token, additional_special_tokens=additional_special_tokens, + **kwargs) try: import sentencepiece as spm @@ -74,13 +87,14 @@ class T5Tokenizer(PreTrainedTokenizer): "pip install sentencepiece") self.vocab_file = vocab_file + self._extra_ids = extra_ids self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(vocab_file) @property def vocab_size(self): - return self.sp_model.get_piece_size() + return self.sp_model.get_piece_size() + self._extra_ids def __getstate__(self): state = self.__dict__.copy() @@ -118,11 +132,18 @@ class T5Tokenizer(PreTrainedTokenizer): def _convert_token_to_id(self, token): """ Converts a token (str/unicode) in an id using the vocab. """ + if token.startswith(u"<extra_id_"): + l = re.match(r'<extra_id_(\d+)>', token) + num = int(l[1]) + return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) def _convert_id_to_token(self, index, return_unicode=True): """Converts an index (integer) in a token (string/unicode) using the vocab.""" - token = self.sp_model.IdToPiece(index) + if index < self.sp_model.get_piece_size(): + token = self.sp_model.IdToPiece(index) + else: + token = u"<extra_id_{}>".format(self.vocab_size - 1 - index) if six.PY2 and return_unicode and isinstance(token, str): token = token.decode('utf-8') return token From 268d4f2099f90bb62949988c3b78596242e1d753 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 8 Nov 2019 16:41:55 +0100 Subject: [PATCH 021/302] fix position biases + better tests --- transformers/modeling_t5.py | 11 +++-- transformers/tests/modeling_t5_test.py | 62 +++++++++++++++----------- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 6be0ae6863..2a74333d31 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -408,7 +408,7 @@ class T5Block(nn.Module): position_bias=position_bias, head_mask=head_mask) hidden_states = self_attention_outputs[0] - outputs = self_attention_outputs[1:] + outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights if not self.is_decoder: hidden_states = self.layer[1](hidden_states) @@ -419,11 +419,11 @@ class T5Block(nn.Module): position_bias=encoder_decoder_position_bias, head_mask=head_mask) hidden_states = cross_attention_outputs[0] - outputs = cross_attention_outputs[1:] + outputs + outputs = outputs + cross_attention_outputs[1:] # Keep cross-attention outputs and relative position weights hidden_states = self.layer[2](hidden_states) outputs = (hidden_states,) + outputs # add attentions if we output them - return outputs + return outputs # hidden-states,
(self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) class T5PreTrainedModel(PreTrainedModel): @@ -564,14 +564,17 @@ class T5Stack(T5PreTrainedModel): encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i]) + # layer_outputs is a tuple with: + # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states = layer_outputs[0] if i == 0: + # We share the position biases between the layers - the first layer store them position_bias = layer_outputs[2 if self.output_attentions else 1] if self.is_decoder: encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2] if self.output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) + all_attentions = all_attentions + (layer_outputs[1],) # We keep only self-attention weights for now hidden_states = self.final_layer_norm(hidden_states) layer_output = self.dropout(hidden_states) diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index 2c67b83c25..091bd742b5 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -45,9 +45,10 @@ class T5ModelTest(CommonTestCases.CommonModelTester): def __init__(self, parent, batch_size=13, - seq_length=7, + encoder_seq_length=7, + decoder_seq_length=9, is_training=True, - use_input_mask=True, + use_attention_mask=True, use_labels=True, vocab_size=99, n_positions=14, @@ -62,9 +63,10 @@ class T5ModelTest(CommonTestCases.CommonModelTester): ): self.parent = parent self.batch_size = batch_size - self.seq_length = seq_length + self.encoder_seq_length = encoder_seq_length + self.decoder_seq_length = decoder_seq_length self.is_training = is_training - self.use_input_mask = use_input_mask + self.use_attention_mask = use_attention_mask self.use_labels = use_labels self.vocab_size = vocab_size self.n_positions = n_positions @@ -78,15 +80,18 @@ class T5ModelTest(CommonTestCases.CommonModelTester): self.scope = scope def prepare_config_and_inputs(self): - input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size) + decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) - input_mask = None - if self.use_input_mask: - input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + encoder_attention_mask = None + decoder_attention_mask = None + if self.use_attention_mask: + encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2) + decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2) - token_labels = None + decoder_lm_labels = None if self.use_labels: - token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) config = T5Config( vocab_size_or_config_json_file=self.vocab_size, @@ -100,21 +105,22 @@ class T5ModelTest(CommonTestCases.CommonModelTester): dropout_rate=self.dropout_rate, initializer_factor=self.initializer_factor) - return (config, input_ids, input_mask, token_labels) + return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels) def check_loss_output(self, result): 
self.parent.assertListEqual( list(result["loss"].size()), []) - def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels): + def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels): model = T5Model(config=config) model.eval() - encoder_output, decoder_output = model(encoder_input_ids=input_ids, - decoder_input_ids=input_ids, - decoder_attention_mask=input_mask) - encoder_output, decoder_output = model(encoder_input_ids=input_ids, - decoder_input_ids=input_ids) + decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids, + encoder_attention_mask=encoder_attention_mask, + decoder_attention_mask=decoder_attention_mask) + decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids, + decoder_input_ids=decoder_input_ids) result = { "encoder_output": encoder_output, @@ -122,17 +128,17 @@ class T5ModelTest(CommonTestCases.CommonModelTester): } self.parent.assertListEqual( list(result["encoder_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + [self.batch_size, self.encoder_seq_length, self.hidden_size]) self.parent.assertListEqual( list(result["decoder_output"].size()), - [self.batch_size, self.seq_length, self.hidden_size]) + [self.batch_size, self.decoder_seq_length, self.hidden_size]) - def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels): + def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels): model = T5WithLMHeadModel(config=config) model.eval() - outputs = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids, - decoder_attention_mask=input_mask, decoder_lm_labels=token_labels) + outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels) loss, prediction_scores = outputs[0], outputs[1] result = { "loss": loss, @@ -140,15 +146,17 @@ class T5ModelTest(CommonTestCases.CommonModelTester): } self.parent.assertListEqual( list(result["prediction_scores"].size()), - [self.batch_size, self.seq_length, self.vocab_size]) + [self.batch_size, self.decoder_seq_length, self.vocab_size]) self.check_loss_output(result) def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() - (config, input_ids, input_mask, token_labels) = config_and_inputs - inputs_dict = {'encoder_input_ids': input_ids, - 'decoder_input_ids': input_ids, - 'decoder_attention_mask': input_mask} + (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, + decoder_attention_mask, decoder_lm_labels) = config_and_inputs + inputs_dict = {'encoder_input_ids': encoder_input_ids, + 'decoder_input_ids': decoder_input_ids, + 'decoder_attention_mask': decoder_attention_mask, + 'encoder_attention_mask': encoder_attention_mask} return config, inputs_dict def setUp(self): From 7da3ef24cd6911e168ae52eddb8b23f9fd61e6d4 Mon Sep 17 00:00:00 2001 From: Xu Hongshen <1505746949@qq.com> Date: Tue, 12 Nov 2019 16:11:24 +0800 Subject: [PATCH 022/302] add is_impossible tensor to model inputs during fine-tuning xlnet on squad2.0 --- examples/run_squad.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index d7fdc32ae7..bd44979e44 100644 --- a/examples/run_squad.py +++ 
b/examples/run_squad.py @@ -147,6 +147,8 @@ def train(args, train_dataset, model, tokenizer): if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) + if args.version_2_with_negative: + inputs.update({'is_impossible': batch[7]}) outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) @@ -339,9 +341,10 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) + all_is_impossible = torch.tensor([1.0 if f.is_impossible == True else 0.0 for f in features], dtype=torch.float) dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_start_positions, all_end_positions, - all_cls_index, all_p_mask) + all_cls_index, all_p_mask, all_is_impossible) if output_examples: return dataset, examples, features From ca99a2d5009c32b4c718e83ed47adacf525261dc Mon Sep 17 00:00:00 2001 From: Xu Hongshen <1505746949@qq.com> Date: Fri, 15 Nov 2019 14:55:26 +0800 Subject: [PATCH 023/302] Update example readme --- examples/README.md | 63 +++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 57 insertions(+), 6 deletions(-) diff --git a/examples/README.md b/examples/README.md index abb4cb6e5a..638a9fb056 100644 --- a/examples/README.md +++ b/examples/README.md @@ -32,7 +32,7 @@ Quick benchmarks from the script (no other modifications): | Titan V | AMP | 26s | 0.8281/0.8568/0.8411 | | V100 | FP32 | 35s | 0.8646/0.8359/0.8464 | | V100 | AMP | 22s | 0.8646/0.8385/0.8411 | -| 1080 Ti | FP32 | 55s | - | +| 1080 Ti | FP32 | 55s | - | Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used). @@ -346,9 +346,9 @@ eval_loss = 0.44457291918821606 Based on the script [`run_squad.py`](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py). -#### Fine-tuning on SQuAD +#### Fine-tuning BERT on SQuAD1.0 -This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) +This example code fine-tunes BERT on the SQuAD1.0 dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB. The data for SQuAD can be downloaded with the following links and should be saved in a $SQUAD_DIR directory. @@ -356,6 +356,12 @@ $SQUAD_DIR directory. 
* [dev-v1.1.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json) * [evaluate-v1.1.py](https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py) +And for SQuAD2.0, you need to download: + +- [train-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json) +- [dev-v2.0.json](https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json) +- [evaluate-v2.0.py](https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/) + ```bash export SQUAD_DIR=/path/to/SQUAD @@ -385,7 +391,7 @@ exact_match = 81.22 #### Distributed training -Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD: +Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD1.0: ```bash python -m torch.distributed.launch --nproc_per_node=8 run_squad.py \ @@ -417,7 +423,9 @@ This fine-tuned model is available as a checkpoint under the reference #### Fine-tuning XLNet on SQuAD -This example code fine-tunes XLNet on the SQuAD dataset. See above to download the data for SQuAD . +This example code fine-tunes XLNet on both SQuAD1.0 and SQuAD2.0 dataset. See above to download the data for SQuAD . + +##### Command for SQuAD1.0: ```bash export SQUAD_DIR=/path/to/SQUAD @@ -440,7 +448,32 @@ python /data/home/hlu/transformers/examples/run_squad.py \ --save_steps 5000 ``` -Training with the previously defined hyper-parameters yields the following results: +##### Command for SQuAD2.0: + +```bash +export SQUAD_DIR=/path/to/SQUAD + +python run_squad.py \ + --model_type xlnet \ + --model_name_or_path xlnet-large-cased \ + --do_train \ + --do_eval \ + --version_2_with_negative \ + --train_file $SQUAD_DIR/train-v2.0.json \ + --predict_file $SQUAD_DIR/dev-v2.0.json \ + --learning_rate 3e-5 \ + --num_train_epochs 4 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir ./wwm_cased_finetuned_squad/ \ + --per_gpu_eval_batch_size=2 \ + --per_gpu_train_batch_size=2 \ + --save_steps 5000 +``` + +Larger batch size may improve the performance while costing more memory. + +##### Results for SQuAD1.0 with the previously defined hyper-parameters: ```python { @@ -453,6 +486,24 @@ Training with the previously defined hyper-parameters yields the following resul } ``` +##### Results for SQuAD2.0 with the previously defined hyper-parameters: + +```python +{ +"exact": 80.4177545691906, +"f1": 84.07154997729623, +"total": 11873, +"HasAns_exact": 76.73751686909581, +"HasAns_f1": 84.05558584352873, +"HasAns_total": 5928, +"NoAns_exact": 84.0874684608915, +"NoAns_f1": 84.0874684608915, +"NoAns_total": 5945 +} +``` + + + ## Named Entity Recognition Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py). 
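Patch 022 above threads answerability information through XLNet fine-tuning on SQuAD 2.0. The sketch below condenses that data path into two helpers; the helper names are invented for illustration, the tensor layout and the `is_impossible` key follow the run_squad.py diff, and the surrounding tensor construction is reconstructed from the same script, so details such as dtypes may differ.

```python
import torch
from torch.utils.data import TensorDataset

def build_squad2_dataset(features):
    """Pack SQuAD features into a TensorDataset, including the new is_impossible flag."""
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
    all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
    all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
    # New in this patch: 1.0 for unanswerable questions, 0.0 otherwise.
    all_is_impossible = torch.tensor([1.0 if f.is_impossible else 0.0 for f in features],
                                     dtype=torch.float)
    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                         all_start_positions, all_end_positions,
                         all_cls_index, all_p_mask, all_is_impossible)

def add_answerability_inputs(inputs, batch, model_type, version_2_with_negative):
    """Extend the usual QA inputs with the XLNet/XLM specific tensors."""
    if model_type in ['xlnet', 'xlm']:
        inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
        if version_2_with_negative:
            # batch[7] is the is_impossible tensor appended to the dataset above.
            inputs.update({'is_impossible': batch[7]})
    return inputs
```

A training step then unpacks a batch from this dataset, builds the usual `input_ids`/`attention_mask`/`start_positions`/`end_positions` inputs, and runs them through `add_answerability_inputs` before calling `model(**inputs)`, which is what the patched loop does when `--version_2_with_negative` is set.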
From ea52f82455a7ca0f979768204dfeb38b5fff13ad Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 18 Nov 2019 14:42:59 -0500 Subject: [PATCH 024/302] Moved some SQuAD logic to /data --- transformers/__init__.py | 3 +- transformers/data/__init__.py | 3 +- transformers/data/processors/__init__.py | 1 + transformers/data/processors/squad.py | 318 +++++++++++++++++++++++ 4 files changed, 323 insertions(+), 2 deletions(-) create mode 100644 transformers/data/processors/squad.py diff --git a/transformers/__init__.py b/transformers/__init__.py index 5c7b0a6197..b859e18c53 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -25,7 +25,8 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH from .data import (is_sklearn_available, InputExample, InputFeatures, DataProcessor, glue_output_modes, glue_convert_examples_to_features, - glue_processors, glue_tasks_num_labels) + glue_processors, glue_tasks_num_labels, + squad_convert_examples_to_features, SquadFeatures) if is_sklearn_available(): from .data import glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index e910d6da2e..827d96ed29 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,5 +1,6 @@ -from .processors import InputExample, InputFeatures, DataProcessor +from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features +from .processors import squad_convert_examples_to_features from .metrics import is_sklearn_available if is_sklearn_available(): diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index af38c54beb..4e322a2ca8 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,3 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features +from .squad import squad_convert_examples_to_features, SquadFeatures diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py new file mode 100644 index 0000000000..c1a1034f17 --- /dev/null +++ b/transformers/data/processors/squad.py @@ -0,0 +1,318 @@ +from tqdm import tqdm +import collections +import logging +import os + +from .utils import DataProcessor, InputExample, InputFeatures +from ...file_utils import is_tf_available + +if is_tf_available(): + import tensorflow as tf + +logger = logging.getLogger(__name__) + +def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, + doc_stride, max_query_length, is_training, + cls_token_at_end=False, + cls_token='[CLS]', sep_token='[SEP]', pad_token=0, + sequence_a_segment_id=0, sequence_b_segment_id=1, + cls_token_segment_id=0, pad_token_segment_id=0, + mask_padding_with_zero=True, + sequence_a_is_doc=False): + """Loads a data file into a list of `InputBatch`s.""" + + # Defining helper methods + def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == tok_answer_text: + return 
(new_start, new_end) + + return (input_start, input_end) + def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + unique_id = 1000000000 + + features = [] + for (example_index, example) in enumerate(tqdm(examples)): + query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + tok_start_position = None + tok_end_position = None + if is_training and example.is_impossible: + tok_start_position = -1 + tok_end_position = -1 + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, + example.orig_answer_text) + + # The -3 accounts for [CLS], [SEP] and [SEP] + max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 + + # We can have documents that are longer than the maximum sequence length. + # To deal with this we do a sliding window approach, where we take chunks + # of the up to our max length with a stride of `doc_stride`. + _DocSpan = collections.namedtuple( # pylint: disable=invalid-name + "DocSpan", ["start", "length"]) + doc_spans = [] + start_offset = 0 + while start_offset < len(all_doc_tokens): + length = len(all_doc_tokens) - start_offset + if length > max_tokens_for_doc: + length = max_tokens_for_doc + doc_spans.append(_DocSpan(start=start_offset, length=length)) + if start_offset + length == len(all_doc_tokens): + break + start_offset += min(length, doc_stride) + + for (doc_span_index, doc_span) in enumerate(doc_spans): + tokens = [] + token_to_orig_map = {} + token_is_max_context = {} + segment_ids = [] + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) 
+ p_mask = [] + + # CLS token at the beginning + if not cls_token_at_end: + tokens.append(cls_token) + segment_ids.append(cls_token_segment_id) + p_mask.append(0) + cls_index = 0 + + # XLNet: P SEP Q SEP CLS + # Others: CLS Q SEP P SEP + if not sequence_a_is_doc: + # Query + tokens += query_tokens + segment_ids += [sequence_a_segment_id] * len(query_tokens) + p_mask += [1] * len(query_tokens) + + # SEP token + tokens.append(sep_token) + segment_ids.append(sequence_a_segment_id) + p_mask.append(1) + + # Paragraph + for i in range(doc_span.length): + split_token_index = doc_span.start + i + token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + is_max_context = _check_is_max_context(doc_spans, doc_span_index, + split_token_index) + token_is_max_context[len(tokens)] = is_max_context + tokens.append(all_doc_tokens[split_token_index]) + if not sequence_a_is_doc: + segment_ids.append(sequence_b_segment_id) + else: + segment_ids.append(sequence_a_segment_id) + p_mask.append(0) + paragraph_len = doc_span.length + + if sequence_a_is_doc: + # SEP token + tokens.append(sep_token) + segment_ids.append(sequence_a_segment_id) + p_mask.append(1) + + tokens += query_tokens + segment_ids += [sequence_b_segment_id] * len(query_tokens) + p_mask += [1] * len(query_tokens) + + # SEP token + tokens.append(sep_token) + segment_ids.append(sequence_b_segment_id) + p_mask.append(1) + + # CLS token at the end + if cls_token_at_end: + tokens.append(cls_token) + segment_ids.append(cls_token_segment_id) + p_mask.append(0) + cls_index = len(tokens) - 1 # Index of classification token + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. + while len(input_ids) < max_seq_length: + input_ids.append(pad_token) + input_mask.append(0 if mask_padding_with_zero else 1) + segment_ids.append(pad_token_segment_id) + p_mask.append(1) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + span_is_impossible = example.is_impossible + start_position = None + end_position = None + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. 
+ doc_start = doc_span.start + doc_end = doc_span.start + doc_span.length - 1 + out_of_span = False + if not (tok_start_position >= doc_start and + tok_end_position <= doc_end): + out_of_span = True + if out_of_span: + start_position = 0 + end_position = 0 + span_is_impossible = True + else: + if sequence_a_is_doc: + doc_offset = 0 + else: + doc_offset = len(query_tokens) + 2 + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + if is_training and span_is_impossible: + start_position = cls_index + end_position = cls_index + + if example_index < 20: + logger.info("*** Example ***") + logger.info("unique_id: %s" % (unique_id)) + logger.info("example_index: %s" % (example_index)) + logger.info("doc_span_index: %s" % (doc_span_index)) + logger.info("tokens: %s" % " ".join(tokens)) + logger.info("token_to_orig_map: %s" % " ".join([ + "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) + logger.info("token_is_max_context: %s" % " ".join([ + "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() + ])) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info( + "input_mask: %s" % " ".join([str(x) for x in input_mask])) + logger.info( + "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + if is_training and span_is_impossible: + logger.info("impossible example") + if is_training and not span_is_impossible: + answer_text = " ".join(tokens[start_position:(end_position + 1)]) + logger.info("start_position: %d" % (start_position)) + logger.info("end_position: %d" % (end_position)) + logger.info( + "answer: %s" % (answer_text)) + + features.append( + SquadFeatures( + unique_id=unique_id, + example_index=example_index, + doc_span_index=doc_span_index, + tokens=tokens, + token_to_orig_map=token_to_orig_map, + token_is_max_context=token_is_max_context, + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + cls_index=cls_index, + p_mask=p_mask, + paragraph_len=paragraph_len, + start_position=start_position, + end_position=end_position, + is_impossible=span_is_impossible)) + unique_id += 1 + + return features + +class SquadFeatures(object): + """A single set of features of data.""" + + def __init__(self, + unique_id, + example_index, + doc_span_index, + tokens, + token_to_orig_map, + token_is_max_context, + input_ids, + input_mask, + segment_ids, + cls_index, + p_mask, + paragraph_len, + start_position=None, + end_position=None, + is_impossible=None): + self.unique_id = unique_id + self.example_index = example_index + self.doc_span_index = doc_span_index + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + self.token_is_max_context = token_is_max_context + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.cls_index = cls_index + self.p_mask = p_mask + self.paragraph_len = paragraph_len + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __eq__(self, other): + return self.cls_index == other.cls_index and \ + self.doc_span_index == other.doc_span_index and \ + self.end_position == other.end_position and \ + self.example_index == other.example_index and \ + self.input_ids == other.input_ids and \ + self.input_mask == other.input_mask and \ + self.is_impossible == other.is_impossible and \ + self.p_mask == other.p_mask and \ + self.paragraph_len == other.paragraph_len and \ + self.segment_ids == other.segment_ids and \ + self.start_position 
== other.start_position and \ + self.token_is_max_context == other.token_is_max_context and \ + self.token_to_orig_map == other.token_to_orig_map and \ + self.tokens == other.tokens and \ + self.unique_id == other.unique_id \ No newline at end of file From 72e506b22e90feab6c410136bacc27f3d65284b9 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 19 Nov 2019 09:49:55 -0500 Subject: [PATCH 025/302] wip --- examples/run_squad.py | 29 +++++- transformers/__init__.py | 3 +- transformers/data/__init__.py | 2 +- transformers/data/processors/__init__.py | 2 +- transformers/data/processors/squad.py | 122 +++++++++++++++++++++++ transformers/tokenization_utils.py | 4 + 6 files changed, 157 insertions(+), 5 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 69088d73c3..d4219c3096 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -23,7 +23,6 @@ import os import random import glob import timeit - import numpy as np import torch from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, @@ -45,7 +44,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from transformers import AdamW, get_linear_schedule_with_warmup +from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples from utils_squad import (read_squad_examples, convert_examples_to_features, RawResult, write_predictions, @@ -309,6 +308,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal examples = read_squad_examples(input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative) + + examples = examples[:10] features = convert_examples_to_features(examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, @@ -319,6 +320,30 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, cls_token_at_end=True if args.model_type in ['xlnet'] else False, sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) + + exampless = sread_squad_examples(input_file=input_file, + is_training=not evaluate, + version_2_with_negative=args.version_2_with_negative) + exampless = exampless[:10] + features2 = squad_convert_examples_to_features(examples=exampless, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, + pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, + cls_token_at_end=True if args.model_type in ['xlnet'] else False, + sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) + + print(features2) + + for i in range(len(features)): + assert features[i] == features2[i] + print("Equal") + + print("DONE") + if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) diff --git a/transformers/__init__.py b/transformers/__init__.py index b859e18c53..9a767913b3 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -26,7 +26,8 @@ from .data import (is_sklearn_available, InputExample, InputFeatures, DataProcessor, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, - squad_convert_examples_to_features, 
SquadFeatures) + squad_convert_examples_to_features, SquadFeatures, + SquadExample, read_squad_examples) if is_sklearn_available(): from .data import glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index 827d96ed29..50f2e768f4 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,6 +1,6 @@ from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .processors import squad_convert_examples_to_features +from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples from .metrics import is_sklearn_available if is_sklearn_available(): diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 4e322a2ca8..924b4a1245 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .squad import squad_convert_examples_to_features, SquadFeatures +from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index c1a1034f17..1900e9f0ce 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -2,7 +2,9 @@ from tqdm import tqdm import collections import logging import os +import json +from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures from ...file_utils import is_tf_available @@ -11,6 +13,7 @@ if is_tf_available(): logger = logging.getLogger(__name__) + def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, cls_token_at_end=False, @@ -265,6 +268,125 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, return features + +def read_squad_examples(input_file, is_training, version_2_with_negative): + """Read a SQuAD json file into a list of SquadExample.""" + with open(input_file, "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + + def is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False + + examples = [] + for entry in input_data: + for paragraph in entry["paragraphs"]: + paragraph_text = paragraph["context"] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + for c in paragraph_text: + if is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + end_position = None + orig_answer_text = None + is_impossible = False + if is_training: + if version_2_with_negative: + is_impossible = qa["is_impossible"] + if (len(qa["answers"]) != 1) and (not is_impossible): + raise ValueError( + "For training, each question should have exactly 1 answer.") + if not is_impossible: + answer = qa["answers"][0] + orig_answer_text = answer["text"] + answer_offset = answer["answer_start"] + 
answer_length = len(orig_answer_text) + start_position = char_to_word_offset[answer_offset] + end_position = char_to_word_offset[answer_offset + answer_length - 1] + # Only add answers where the text can be exactly recovered from the + # document. If this CAN'T happen it's likely due to weird Unicode + # stuff so we will just skip the example. + # + # Note that this means for training mode, every example is NOT + # guaranteed to be preserved. + actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join( + whitespace_tokenize(orig_answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. '%s'", + actual_text, cleaned_answer_text) + continue + else: + start_position = -1 + end_position = -1 + orig_answer_text = "" + + example = SquadExample( + qas_id=qas_id, + question_text=question_text, + doc_tokens=doc_tokens, + orig_answer_text=orig_answer_text, + start_position=start_position, + end_position=end_position, + is_impossible=is_impossible) + examples.append(example) + return examples + + +class SquadExample(object): + """ + A single training/test example for the Squad dataset. + For examples without an answer, the start and end position are -1. + """ + + def __init__(self, + qas_id, + question_text, + doc_tokens, + orig_answer_text=None, + start_position=None, + end_position=None, + is_impossible=None): + self.qas_id = qas_id + self.question_text = question_text + self.doc_tokens = doc_tokens + self.orig_answer_text = orig_answer_text + self.start_position = start_position + self.end_position = end_position + self.is_impossible = is_impossible + + def __str__(self): + return self.__repr__() + + def __repr__(self): + s = "" + s += "qas_id: %s" % (self.qas_id) + s += ", question_text: %s" % ( + self.question_text) + s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) + if self.start_position: + s += ", start_position: %d" % (self.start_position) + if self.end_position: + s += ", end_position: %d" % (self.end_position) + if self.is_impossible: + s += ", is_impossible: %r" % (self.is_impossible) + return s + + class SquadFeatures(object): """A single set of features of data.""" diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 4fa26a26f8..ba10e6b311 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -605,6 +605,10 @@ class PreTrainedTokenizer(object): vocabularies (BPE/SentencePieces/WordPieces). Take care of added tokens. + + text: The sequence to be encoded. + return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False). 
+ **kwargs: passed to the child `self.tokenize()` method """ def split_on_token(tok, text): result = [] From 9f374c8252330bffd669c43749b5e937ed31d90a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Fri, 22 Nov 2019 16:27:15 -0500 Subject: [PATCH 026/302] `encode` and `encode_plus` handle attention masks and padding --- .../tests/tokenization_tests_commons.py | 51 ++++++++++++ transformers/tokenization_utils.py | 77 ++++++++++++++++++- transformers/tokenization_xlnet.py | 1 + 3 files changed, 127 insertions(+), 2 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index fdaf8cc137..d5b70d5266 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -335,3 +335,54 @@ class CommonTestCases: special_tokens_mask = tokenizer.get_special_tokens_mask(encoded_sequence_w_special, already_has_special_tokens=True) self.assertEqual(len(special_tokens_mask), len(encoded_sequence_w_special)) self.assertEqual(special_tokens_mask_orig, special_tokens_mask) + + def test_padding_to_max_length(self): + tokenizer = self.get_tokenizer() + + sequence = "Sequence" + padding_size = 10 + padding_idx = tokenizer.pad_token_id + + # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence_length = len(padded_sequence) + assert sequence_length + padding_size == padded_sequence_length + assert encoded_sequence + [padding_idx] * padding_size == padded_sequence + + # Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence = tokenizer.encode(sequence, pad_to_max_length=True) + padded_sequence_length = len(padded_sequence) + assert sequence_length == padded_sequence_length + assert encoded_sequence == padded_sequence + + def test_encode_plus_with_padding(self): + tokenizer = self.get_tokenizer() + + sequence = "Sequence" + padding_size = 10 + padding_idx = tokenizer.pad_token_id + token_type_padding_idx = tokenizer.pad_token_type_id + + encoded_sequence = tokenizer.encode_plus(sequence, return_special_tokens_mask=True) + input_ids = encoded_sequence['input_ids'] + token_type_ids = encoded_sequence['token_type_ids'] + attention_mask = encoded_sequence['attention_mask'] + special_tokens_mask = encoded_sequence['special_tokens_mask'] + sequence_length = len(input_ids) + + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) + padded_input_ids = padded_sequence['input_ids'] + padded_token_type_ids = padded_sequence['token_type_ids'] + padded_attention_mask = padded_sequence['attention_mask'] + padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence_length = len(padded_input_ids) + + assert sequence_length + padding_size == padded_sequence_length + assert input_ids + [padding_idx] * padding_size == padded_input_ids + assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids + assert attention_mask + [0] * padding_size == padded_attention_mask + assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask \ No newline at end of file diff --git 
a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index ba10e6b311..3214699e12 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -190,6 +190,11 @@ class PreTrainedTokenizer(object): """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ return self.convert_tokens_to_ids(self.pad_token) + @property + def pad_token_type_id(self): + """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + return self._pad_token_type_id + @property def cls_token_id(self): """ Id of the classification token in the vocabulary. E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ @@ -213,6 +218,7 @@ class PreTrainedTokenizer(object): self._pad_token = None self._cls_token = None self._mask_token = None + self._pad_token_type_id = 0 self._additional_special_tokens = [] self.max_len = max_len if max_len is not None else int(1e12) @@ -696,6 +702,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', + pad_to_max_length=False, return_tensors=None, **kwargs): """ @@ -722,6 +729,8 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding index, up to their max length. If no max length is specified, no padding is done. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -732,6 +741,7 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, + pad_to_max_length=pad_to_max_length, return_tensors=return_tensors, **kwargs) @@ -744,7 +754,12 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', + pad_to_max_length=False, return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False, **kwargs): """ Returns a dictionary containing the encoded sequence or sequence pair and additional informations: @@ -769,9 +784,37 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding index, up to their max length. If no max length is specified, no padding is done. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. + return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). + return_attention_mask: (optional) Set to False to avoir returning attention mask (default True) + return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). 
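# Illustrative sketch, not part of the patch: how the new `pad_to_max_length`
# flag and the `return_*` switches documented above behave. The checkpoint
# name and the max_length value are placeholders chosen for the example.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
enc = tokenizer.encode_plus("Sequence",
                            add_special_tokens=True,
                            max_length=16,
                            pad_to_max_length=True,
                            return_special_tokens_mask=True)
# input_ids is padded with tokenizer.pad_token_id up to max_length,
# attention_mask marks padded positions with 0, and the padded positions
# are also flagged in special_tokens_mask.
assert len(enc['input_ids']) == 16
assert enc['attention_mask'][-1] == 0
assert enc['special_tokens_mask'][-1] == 1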
+ return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). **kwargs: passed to the `self.tokenize()` method + + Return: + A Dictionary of shape:: + + { + input_ids: list[int], + token_type_ids: list[int] if return_token_type_ids is True (default) + attention_mask: list[int] if return_attention_mask is True (default) + overflowing_tokens: list[int] if a ``max_length`` is specified and return_overflowing_tokens is True + num_truncated_tokens: int if a ``max_length`` is specified and return_overflowing_tokens is True + special_tokens_mask: list[int] if ``add_special_tokens`` if set to ``True`` and return_special_tokens_mask is True + } + + With the fields: + ``input_ids``: list of token ids to be fed to a model + ``token_type_ids``: list of token type ids to be fed to a model + ``attention_mask``: list of indices specifying which tokens should be attended to by the model + + ``overflowing_tokens``: list of overflowing tokens if a max length is specified. + ``num_truncated_tokens``: number of overflowing tokens a ``max_length`` is specified + ``special_tokens_mask``: if adding special tokens, this is a list of [0, 1], with 0 specifying special added + tokens and 1 specifying sequence tokens. """ def get_input_ids(text): @@ -790,13 +833,24 @@ class PreTrainedTokenizer(object): return self.prepare_for_model(first_ids, pair_ids=second_ids, max_length=max_length, + pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, - return_tensors=return_tensors) + return_tensors=return_tensors, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask) def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, - truncation_strategy='longest_first', return_tensors=None): + truncation_strategy='longest_first', + pad_to_max_length=False, + return_tensors=None, + return_token_type_ids=True, + return_attention_mask=True, + return_overflowing_tokens=False, + return_special_tokens_mask=False): """ Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It adds special tokens, truncates @@ -819,8 +873,14 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding index, up to their max length. If no max length is specified, no padding is done. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. + return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). + return_attention_mask: (optional) Set to False to avoir returning attention mask (default True) + return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). + return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). Return: A Dictionary of shape:: @@ -883,6 +943,19 @@ class PreTrainedTokenizer(object): "for this model ({} > {}). 
Running this sequence through the model will result in " "indexing errors".format(len(ids), self.max_len)) + if pad_to_max_length and max_length and len(encoded_inputs["input_ids"]) < max_length: + difference = max_length - len(encoded_inputs["input_ids"]) + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] += [self.pad_token_type_id] * difference + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] += [1] * difference + + encoded_inputs["input_ids"] += [self.pad_token_id] * difference + elif return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + return encoded_inputs def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index a4f1a6e3ba..3ea71f4438 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -74,6 +74,7 @@ class XLNetTokenizer(PreTrainedTokenizer): self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + self._pad_token_type_id = 3 try: import sentencepiece as spm From a7dafe2f41222469797f1a67232961d67bd2e519 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 21 Nov 2019 11:30:40 -0500 Subject: [PATCH 027/302] Padding strategy (left and right) rather than boolean flag --- .../tests/tokenization_tests_commons.py | 43 +++++++++++--- transformers/tokenization_utils.py | 58 ++++++++++++++----- 2 files changed, 77 insertions(+), 24 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index d5b70d5266..40d68d0ab2 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -343,21 +343,33 @@ class CommonTestCases: padding_size = 10 padding_idx = tokenizer.pad_token_id - # Check that it correctly pads when a maximum length is specified along with the padding flag set to True + # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='right') padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert encoded_sequence + [padding_idx] * padding_size == padded_sequence - # Check that nothing is done when a maximum length is not specified + # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, pad_to_max_length=True) + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='left') padded_sequence_length = len(padded_sequence) - assert sequence_length == padded_sequence_length - assert encoded_sequence == padded_sequence + assert sequence_length + padding_size == padded_sequence_length + assert 
[padding_idx] * padding_size + encoded_sequence == padded_sequence + + # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified + encoded_sequence = tokenizer.encode(sequence) + sequence_length = len(encoded_sequence) + padded_sequence_right = tokenizer.encode(sequence, padding_strategy='right') + padded_sequence_right_length = len(padded_sequence_right) + padded_sequence_left = tokenizer.encode(sequence, padding_strategy='left') + padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_right_length + assert encoded_sequence == padded_sequence_right + assert sequence_length == padded_sequence_left_length + assert encoded_sequence == padded_sequence_left def test_encode_plus_with_padding(self): tokenizer = self.get_tokenizer() @@ -374,7 +386,8 @@ class CommonTestCases: special_tokens_mask = encoded_sequence['special_tokens_mask'] sequence_length = len(input_ids) - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) + # Test right padding + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='right', return_special_tokens_mask=True) padded_input_ids = padded_sequence['input_ids'] padded_token_type_ids = padded_sequence['token_type_ids'] padded_attention_mask = padded_sequence['attention_mask'] @@ -385,4 +398,18 @@ class CommonTestCases: assert input_ids + [padding_idx] * padding_size == padded_input_ids assert token_type_ids + [token_type_padding_idx] * padding_size == padded_token_type_ids assert attention_mask + [0] * padding_size == padded_attention_mask - assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask \ No newline at end of file + assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask + + # Test left padding + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='left', return_special_tokens_mask=True) + padded_input_ids = padded_sequence['input_ids'] + padded_token_type_ids = padded_sequence['token_type_ids'] + padded_attention_mask = padded_sequence['attention_mask'] + padded_special_tokens_mask = padded_sequence['special_tokens_mask'] + padded_sequence_length = len(padded_input_ids) + + assert sequence_length + padding_size == padded_sequence_length + assert [padding_idx] * padding_size + input_ids == padded_input_ids + assert [token_type_padding_idx] * padding_size + token_type_ids == padded_token_type_ids + assert [0] * padding_size + attention_mask == padded_attention_mask + assert [1] * padding_size + special_tokens_mask == padded_special_tokens_mask \ No newline at end of file diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 3214699e12..dbbabd0e1a 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -702,7 +702,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - pad_to_max_length=False, + padding_strategy=None, return_tensors=None, **kwargs): """ @@ -729,8 +729,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + 
padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's padding index, up to their max length. If no max length is specified, no padding is done. + The strategies are handled by the following strings: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to None: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -741,7 +745,7 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, - pad_to_max_length=pad_to_max_length, + padding_strategy=padding_strategy, return_tensors=return_tensors, **kwargs) @@ -754,7 +758,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - pad_to_max_length=False, + padding_strategy=None, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -784,8 +788,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's padding index, up to their max length. If no max length is specified, no padding is done. + The strategies are handled by the following strings: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to None: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). @@ -833,7 +841,7 @@ class PreTrainedTokenizer(object): return self.prepare_for_model(first_ids, pair_ids=second_ids, max_length=max_length, - pad_to_max_length=pad_to_max_length, + padding_strategy=padding_strategy, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, @@ -845,7 +853,7 @@ class PreTrainedTokenizer(object): def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, truncation_strategy='longest_first', - pad_to_max_length=False, + padding_strategy=None, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -873,8 +881,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - pad_to_max_length: if set to `True`, the returned sequences will be padded according to the model's + padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's padding index, up to their max length. If no max length is specified, no padding is done. + The strategies are handled by the following strings: + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + Defaults to None: no padding. 
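# Illustrative sketch, not part of the patch: the padding_strategy argument
# introduced here, mirroring the updated tests above. 'right' appends padding
# ids and 'left' prepends them; the checkpoint name is only a placeholder.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
pad = tokenizer.pad_token_id

ids = tokenizer.encode("Sequence", add_special_tokens=True)
right = tokenizer.encode("Sequence", add_special_tokens=True,
                         max_length=len(ids) + 4, padding_strategy='right')
left = tokenizer.encode("Sequence", add_special_tokens=True,
                        max_length=len(ids) + 4, padding_strategy='left')

assert right == ids + [pad] * 4
assert left == [pad] * 4 + ids
# Without a max_length the strategy is a no-op and the raw encoding comes back.
assert tokenizer.encode("Sequence", add_special_tokens=True,
                        padding_strategy='right') == ids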
return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). @@ -943,16 +955,30 @@ class PreTrainedTokenizer(object): "for this model ({} > {}). Running this sequence through the model will result in " "indexing errors".format(len(ids), self.max_len)) - if pad_to_max_length and max_length and len(encoded_inputs["input_ids"]) < max_length: + if padding_strategy is not None and max_length and len(encoded_inputs["input_ids"]) < max_length: difference = max_length - len(encoded_inputs["input_ids"]) - if return_attention_mask: - encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference - if return_token_type_ids: - encoded_inputs["token_type_ids"] += [self.pad_token_type_id] * difference - if return_special_tokens_mask: - encoded_inputs["special_tokens_mask"] += [1] * difference - encoded_inputs["input_ids"] += [self.pad_token_id] * difference + if padding_strategy == 'right': + if return_attention_mask: + encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference + if return_token_type_ids: + encoded_inputs["token_type_ids"] = encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference + + elif padding_strategy == 'left': + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) + if return_token_type_ids: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] + if return_special_tokens_mask: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] + + else: + raise ValueError("Invalid padding strategy:" + str(padding_strategy)) + elif return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) From a5a8a6175fb5cc1e993366add026ba06386bde10 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 21 Nov 2019 19:18:20 -0500 Subject: [PATCH 028/302] Works for BERT --- transformers/data/processors/squad.py | 507 ++++++++++++++++++++++---- 1 file changed, 432 insertions(+), 75 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 1900e9f0ce..a0f2408a16 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -3,6 +3,7 @@ import collections import logging import os import json +import numpy as np from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures @@ -13,10 +14,68 @@ if is_tf_available(): logger = logging.getLogger(__name__) +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, + orig_answer_text): + """Returns tokenized answer spans that better match the annotated answer.""" + tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) + + for new_start in range(input_start, input_end + 1): + for new_end in range(input_end, new_start - 1, -1): + text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + if text_span == 
tok_answer_text: + return (new_start, new_end) + + return (input_start, input_end) + +def _check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span.start + doc_span.length - 1 + if position < doc_span.start: + continue + if position > end: + continue + num_left_context = position - doc_span.start + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span.length + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + + +def _new_check_is_max_context(doc_spans, cur_span_index, position): + """Check if this is the 'max context' doc span for the token.""" + # if len(doc_spans) == 1: + # return True + best_score = None + best_span_index = None + for (span_index, doc_span) in enumerate(doc_spans): + end = doc_span["start"] + doc_span["length"] - 1 + if position < doc_span["start"]: + continue + if position > end: + continue + num_left_context = position - doc_span["start"] + num_right_context = end - position + score = min(num_left_context, num_right_context) + 0.01 * doc_span["length"] + if best_score is None or score > best_score: + best_score = score + best_span_index = span_index + + return cur_span_index == best_span_index + +def _is_whitespace(c): + if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: + return True + return False def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, - cls_token_at_end=False, + cls_token_at_end=True, cls_token='[CLS]', sep_token='[SEP]', pad_token=0, sequence_a_segment_id=0, sequence_b_segment_id=1, cls_token_segment_id=0, pad_token_segment_id=0, @@ -24,57 +83,184 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_a_is_doc=False): """Loads a data file into a list of `InputBatch`s.""" - # Defining helper methods - def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - + # Defining helper methods unique_id = 1000000000 features = [] + new_features = [] for (example_index, example) in enumerate(tqdm(examples)): - query_tokens = tokenizer.tokenize(example.question_text) - if len(query_tokens) > max_query_length: - query_tokens = 
query_tokens[0:max_query_length] + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in example.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + if is_training: + # Get start and end position + answer_length = len(example.answer_text) + start_position = char_to_word_offset[example.start_position] + end_position = char_to_word_offset[example.start_position + answer_length - 1] + + # If the answer cannot be found in the text, then skip this example. + actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) + continue tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): + for (i, token) in enumerate(doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) + spans = [] + + truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair + + encoded_dict = tokenizer.encode_plus( + truncated_query, + all_doc_tokens, + max_length=max_seq_length, + padding_strategy='right', + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + return_overflowing_tokens=True, + truncation_strategy='only_second' + ) + + ids = encoded_dict['input_ids'] + print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids + paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[0 + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = 0 + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) + while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict: + + overflowing_tokens = encoded_dict['overflowing_tokens'] + + print("OVERFLOW", len(overflowing_tokens)) + + encoded_dict = tokenizer.encode_plus( + truncated_query, + overflowing_tokens, + max_length=max_seq_length, + return_overflowing_tokens=True, + padding_strategy='right', + stride=max_seq_length - doc_stride - len(truncated_query) - 
sequence_pair_added_tokens, + truncation_strategy='only_second' + ) + + ids = encoded_dict['input_ids'] + print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + + # Length of the document without the query + paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + + non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + # split_token_index = doc_span.start + i + # token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] + + # is_max_context = _check_is_max_context(doc_spans, doc_span_index, + # split_token_index) + # token_is_max_context[len(tokens)] = is_max_context + # tokens.append(all_doc_tokens[split_token_index]) + + spans.append(encoded_dict) + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + print("new span", len(spans)) + for span in spans: + # Identify the position of the CLS token + cls_index = span['input_ids'].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) + p_mask = np.array(span['token_type_ids']) + + # Convert all SEP indices to '0' before inversion + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 0 + + # Limit positive values to one + p_mask = 1 - np.minimum(p_mask, 1) + + # Set the CLS index to '0' + p_mask[cls_index] = 0 + + print("new features length", len(new_features)) + + new_features.append(NewSquadFeatures( + span['input_ids'], + span['attention_mask'], + span['token_type_ids'], + cls_index, + p_mask.tolist(), + + example_index=example_index, + unique_id=unique_id, + paragraph_len=span['paragraph_len'], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"] + )) + + unique_id += 1 + + # tokenize ... 
+ query_tokens = tokenizer.tokenize(example.question_text) + + if len(query_tokens) > max_query_length: + query_tokens = query_tokens[0:max_query_length] + tok_start_position = None tok_end_position = None if is_training and example.is_impossible: @@ -82,7 +268,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_end_position = -1 if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: + if example.end_position < len(doc_tokens) - 1: tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 else: tok_end_position = len(all_doc_tokens) - 1 @@ -101,14 +287,19 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): + print("OLD DOC CREATION BEGIN", start_offset, len(all_doc_tokens)) length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): + print("Done with this doc span, breaking out.", start_offset, length) break + print("CHOOSING OFFSET", length, doc_stride) start_offset += min(length, doc_stride) + print("OLD DOC CREATION END", start_offset) + print("old span", len(doc_spans)) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} @@ -183,18 +374,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # tokens are attended to. input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. while len(input_ids) < max_seq_length: input_ids.append(pad_token) input_mask.append(0 if mask_padding_with_zero else 1) segment_ids.append(pad_token_segment_id) p_mask.append(1) - + print("[OLD] Ids computed; position of the first padding", input_ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in input_ids else None) assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length - span_is_impossible = example.is_impossible + span_is_impossible = example.is_impossible if hasattr(example, "is_impossible") else False start_position = None end_position = None if is_training and not span_is_impossible: @@ -222,31 +415,32 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, start_position = cls_index end_position = cls_index - if example_index < 20: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if is_training and span_is_impossible: - logger.info("impossible example") - if is_training and not span_is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - logger.info("start_position: %d" % (start_position)) - logger.info("end_position: %d" % 
(end_position)) - logger.info( - "answer: %s" % (answer_text)) + # if example_index < 20: + # logger.info("*** Example ***") + # logger.info("unique_id: %s" % (unique_id)) + # logger.info("example_index: %s" % (example_index)) + # logger.info("doc_span_index: %s" % (doc_span_index)) + # logger.info("tokens: %s" % str(tokens)) + # logger.info("token_to_orig_map: %s" % " ".join([ + # "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) + # logger.info("token_is_max_context: %s" % " ".join([ + # "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() + # ])) + # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + # logger.info( + # "input_mask: %s" % " ".join([str(x) for x in input_mask])) + # logger.info( + # "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + # if is_training and span_is_impossible: + # logger.info("impossible example") + # if is_training and not span_is_impossible: + # answer_text = " ".join(tokens[start_position:(end_position + 1)]) + # logger.info("start_position: %d" % (start_position)) + # logger.info("end_position: %d" % (end_position)) + # logger.info( + # "answer: %s" % (answer_text)) + print("features length", len(features)) features.append( SquadFeatures( unique_id=unique_id, @@ -266,7 +460,48 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, is_impossible=span_is_impossible)) unique_id += 1 - return features + assert len(features) == len(new_features) + + assert len(features) == len(new_features) + for i in range(len(features)): + print(i) + feature, new_feature = features[i], new_features[i] + + input_ids = feature.input_ids + input_mask = feature.input_mask + segment_ids = feature.segment_ids + cls_index = feature.cls_index + p_mask = feature.p_mask + example_index = feature.example_index + paragraph_len = feature.paragraph_len + token_is_max_context = feature.token_is_max_context + tokens = feature.tokens + token_to_orig_map = feature.token_to_orig_map + + new_input_ids = new_feature.input_ids + new_input_mask = new_feature.attention_mask + new_segment_ids = new_feature.token_type_ids + new_cls_index = new_feature.cls_index + new_p_mask = new_feature.p_mask + new_example_index = new_feature.example_index + new_paragraph_len = new_feature.paragraph_len + new_token_is_max_context = new_feature.token_is_max_context + new_tokens = new_feature.tokens + new_token_to_orig_map = new_feature.token_to_orig_map + + assert input_ids == new_input_ids + assert input_mask == new_input_mask + assert segment_ids == new_segment_ids + assert cls_index == new_cls_index + assert p_mask == new_p_mask + assert example_index == new_example_index + assert paragraph_len == new_paragraph_len + assert token_is_max_context == new_token_is_max_context + assert tokens == new_tokens + assert token_to_orig_map == new_token_to_orig_map + + + return new_features def read_squad_examples(input_file, is_training, version_2_with_negative): @@ -347,6 +582,124 @@ def read_squad_examples(input_file, is_training, version_2_with_negative): return examples +class SquadV1Processor(DataProcessor): + """Processor for the SQuAD data set.""" + + def get_example_from_tensor_dict(self, tensor_dict): + """See base class.""" + return NewSquadExample( + tensor_dict['id'].numpy(), + tensor_dict['question'].numpy().decode('utf-8'), + tensor_dict['context'].numpy().decode('utf-8'), + tensor_dict['answers']['text'].numpy().decode('utf-8'), + tensor_dict['answers']['answers_start'].numpy().decode('utf-8'), + 
tensor_dict['title'].numpy().decode('utf-8') + ) + + def get_train_examples(self, data_dir): + """See base class.""" + with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader: + input_data = json.load(reader)["data"] + return self._create_examples(input_data, "dev") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, input_data, set_type): + """Creates examples for the training and dev sets.""" + + is_training = set_type == "train" + examples = [] + for entry in input_data: + title = entry['title'] + for paragraph in entry["paragraphs"]: + context_text = paragraph["context"] + for qa in paragraph["qas"]: + qas_id = qa["id"] + question_text = qa["question"] + start_position = None + answer_text = None + if is_training: + if (len(qa["answers"]) != 1): + raise ValueError( + "For training, each question should have exactly 1 answer.") + answer = qa["answers"][0] + answer_text = answer['text'] + start_position = answer['answer_start'] + + example = NewSquadExample( + qas_id=qas_id, + question_text=question_text, + context_text=context_text, + answer_text=answer_text, + start_position=start_position, + title=title + ) + examples.append(example) + return examples + + + +class NewSquadExample(object): + """ + A single training/test example for the Squad dataset, as loaded from disk. + """ + + def __init__(self, + qas_id, + question_text, + context_text, + answer_text, + start_position, + title): + self.qas_id = qas_id + self.question_text = question_text + self.context_text = context_text + self.answer_text = answer_text + self.start_position = start_position + self.title = title + + +class NewSquadFeatures(object): + """ + Single squad example features to be fed to a model. + Those features are model-specific. + """ + + def __init__(self, + input_ids, + attention_mask, + token_type_ids, + cls_index, + p_mask, + + example_index, + unique_id, + paragraph_len, + token_is_max_context, + tokens, + token_to_orig_map + ): + self.input_ids = input_ids + self.attention_mask = attention_mask + self.token_type_ids = token_type_ids + self.cls_index = cls_index + self.p_mask = p_mask + + self.example_index = example_index + self.unique_id = unique_id + self.paragraph_len = paragraph_len + self.token_is_max_context = token_is_max_context + self.tokens = tokens + self.token_to_orig_map = token_to_orig_map + class SquadExample(object): """ A single training/test example for the Squad dataset. 
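# Illustrative sketch, not part of the patch: the "max context" rule that
# _check_is_max_context / _new_check_is_max_context implement. When overlapping
# doc spans contain the same token, the token is attributed to the span where
# min(left context, right context) + 0.01 * span length is largest. The span
# boundaries and token position below are made-up values for the example.
spans = [{"start": 0, "length": 8}, {"start": 4, "length": 8}]
position = 5  # token index within the full document


def span_score(span, position):
    left = position - span["start"]
    right = span["start"] + span["length"] - 1 - position
    return min(left, right) + 0.01 * span["length"]


scores = [span_score(span, position) for span in spans]
# Token 5 has 2 tokens of right context in the first span but only 1 token of
# left context in the second, so the first span is its "max context" span.
assert scores.index(max(scores)) == 0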
@@ -423,18 +776,22 @@ class SquadFeatures(object): self.is_impossible = is_impossible def __eq__(self, other): - return self.cls_index == other.cls_index and \ - self.doc_span_index == other.doc_span_index and \ - self.end_position == other.end_position and \ - self.example_index == other.example_index and \ + print(self.example_index == other.example_index) + print(self.input_ids == other.input_ids) + print(self.input_mask == other.attention_mask) + print(self.p_mask == other.p_mask) + print(self.paragraph_len == other.paragraph_len) + print(self.segment_ids == other.token_type_ids) + print(self.token_is_max_context == other.token_is_max_context) + print(self.token_to_orig_map == other.token_to_orig_map) + print(self.tokens == other.tokens) + + return self.example_index == other.example_index and \ self.input_ids == other.input_ids and \ - self.input_mask == other.input_mask and \ - self.is_impossible == other.is_impossible and \ + self.input_mask == other.attention_mask and \ self.p_mask == other.p_mask and \ self.paragraph_len == other.paragraph_len and \ - self.segment_ids == other.segment_ids and \ - self.start_position == other.start_position and \ + self.segment_ids == other.token_type_ids and \ self.token_is_max_context == other.token_is_max_context and \ self.token_to_orig_map == other.token_to_orig_map and \ - self.tokens == other.tokens and \ - self.unique_id == other.unique_id \ No newline at end of file + self.tokens == other.tokens \ No newline at end of file From c3ba6452377f085d0f59e15b97ac247bca24367e Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 22 Nov 2019 14:36:49 -0500 Subject: [PATCH 029/302] Works for XLNet --- examples/run_squad.py | 38 ++++-------- transformers/data/processors/squad.py | 84 +++++++++++++-------------- 2 files changed, 50 insertions(+), 72 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index d4219c3096..634b566a46 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -16,6 +16,7 @@ """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function +from transformers.data.processors.squad import SquadV1Processor import argparse import logging @@ -46,8 +47,7 @@ from transformers import (WEIGHTS_NAME, BertConfig, from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples -from utils_squad import (read_squad_examples, convert_examples_to_features, - RawResult, write_predictions, +from utils_squad import (RawResult, write_predictions, RawResultExtended, write_predictions_extended) # The follwing import is the official SQuAD evaluation script (2.0). 
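# Illustrative sketch, not part of the patch: how the new conversion code
# windows a long document by repeatedly asking encode_plus for overflowing
# tokens, as done in squad_convert_examples_to_features above; doc_stride sets
# how far consecutive windows advance. Checkpoint, question and context are
# placeholders, and the loop simply mirrors the logic of the patched function.
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
question = "What does the stride control?"
context = "A long paragraph " * 100  # long enough to need several windows

max_seq_length, doc_stride = 64, 32
doc_tokens = tokenizer.tokenize(context)
query_ids = tokenizer.encode(question, add_special_tokens=False)
sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair

def encode_window(doc):
    return tokenizer.encode_plus(
        query_ids, doc,
        max_length=max_seq_length,
        padding_strategy='right',
        stride=max_seq_length - doc_stride - len(query_ids) - sequence_pair_added_tokens,
        return_overflowing_tokens=True,
        truncation_strategy='only_second')

spans = [encode_window(doc_tokens)]
while "overflowing_tokens" in spans[-1] and len(spans) * doc_stride < len(doc_tokens):
    spans.append(encode_window(spans[-1]["overflowing_tokens"]))
# Each span holds at most max_seq_length ids; consecutive spans overlap by the
# stride value passed above, so every document token appears in some window.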
@@ -289,7 +289,6 @@ def evaluate(args, model, tokenizer, prefix=""): results = evaluate_on_squad(evaluate_options) return results - def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): if args.local_rank not in [-1, 0] and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache @@ -308,9 +307,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal examples = read_squad_examples(input_file=input_file, is_training=not evaluate, version_2_with_negative=args.version_2_with_negative) - - examples = examples[:10] - features = convert_examples_to_features(examples=examples, + keep_n_examples = 1000 + processor = SquadV1Processor() + values = processor.get_dev_examples("examples/squad") + examples = values[:keep_n_examples] + features = squad_convert_examples_to_features(examples=exampless, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, @@ -320,29 +321,10 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, cls_token_at_end=True if args.model_type in ['xlnet'] else False, sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) - - exampless = sread_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - exampless = exampless[:10] - features2 = squad_convert_examples_to_features(examples=exampless, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate, - cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, - pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, - cls_token_at_end=True if args.model_type in ['xlnet'] else False, - sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) - - print(features2) - - for i in range(len(features)): - assert features[i] == features2[i] - print("Equal") - print("DONE") + + import sys + sys.exit() if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index a0f2408a16..fb3d2ae4d4 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -83,6 +83,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_a_is_doc=False): """Loads a data file into a list of `InputBatch`s.""" + cls_token = tokenizer.cls_token + sep_token = tokenizer.sep_token + # Defining helper methods unique_id = 1000000000 @@ -136,24 +139,24 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair encoded_dict = tokenizer.encode_plus( - truncated_query, - all_doc_tokens, + truncated_query if not sequence_a_is_doc else all_doc_tokens, + all_doc_tokens if not sequence_a_is_doc else truncated_query, max_length=max_seq_length, padding_strategy='right', stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, return_overflowing_tokens=True, - truncation_strategy='only_second' + truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' ) ids = encoded_dict['input_ids'] - print("Ids computes; position of the first padding", 
ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): - token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[0 + i] + index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i + token_to_orig_map[index] = tok_to_orig_index[0 + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens @@ -164,35 +167,40 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, encoded_dict["length"] = paragraph_len spans.append(encoded_dict) - print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) + # print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) + while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict: - - overflowing_tokens = encoded_dict['overflowing_tokens'] - - print("OVERFLOW", len(overflowing_tokens)) - + overflowing_tokens = encoded_dict["overflowing_tokens"] encoded_dict = tokenizer.encode_plus( - truncated_query, - overflowing_tokens, + truncated_query if not sequence_a_is_doc else overflowing_tokens, + overflowing_tokens if not sequence_a_is_doc else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, padding_strategy='right', stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' + truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' ) - ids = encoded_dict['input_ids'] - print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + # print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) + + # print(encoded_dict["input_ids"].index(tokenizer.pad_token_id) if tokenizer.pad_token_id in encoded_dict["input_ids"] else None) + # print(len(spans) * doc_stride, len(all_doc_tokens)) + # Length of the document without the query paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) - non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + if tokenizer.pad_token_id in encoded_dict['input_ids']: + non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + else: + non_padded_ids = encoded_dict['input_ids'] + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): - token_to_orig_map[len(truncated_query) + sequence_added_tokens + i] = tok_to_orig_index[len(spans) * doc_stride + i] + index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len encoded_dict["tokens"] = tokens @@ -202,23 +210,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, encoded_dict["start"] = len(spans) * doc_stride encoded_dict["length"] = paragraph_len - # 
split_token_index = doc_span.start + i - # token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - # is_max_context = _check_is_max_context(doc_spans, doc_span_index, - # split_token_index) - # token_is_max_context[len(tokens)] = is_max_context - # tokens.append(all_doc_tokens[split_token_index]) - spans.append(encoded_dict) for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + index = j if sequence_a_is_doc else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j spans[doc_span_index]["token_is_max_context"][index] = is_max_context - print("new span", len(spans)) for span in spans: # Identify the position of the CLS token cls_index = span['input_ids'].index(tokenizer.cls_token_id) @@ -227,17 +226,17 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # Original TF implem also keep the classification token (set to 0) (not sure why...) p_mask = np.array(span['token_type_ids']) - # Convert all SEP indices to '0' before inversion - p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 0 + p_mask = np.minimum(p_mask, 1) - # Limit positive values to one - p_mask = 1 - np.minimum(p_mask, 1) + if not sequence_a_is_doc: + # Limit positive values to one + p_mask = 1 - p_mask + + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 # Set the CLS index to '0' p_mask[cls_index] = 0 - print("new features length", len(new_features)) - new_features.append(NewSquadFeatures( span['input_ids'], span['attention_mask'], @@ -287,19 +286,15 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_spans = [] start_offset = 0 while start_offset < len(all_doc_tokens): - print("OLD DOC CREATION BEGIN", start_offset, len(all_doc_tokens)) length = len(all_doc_tokens) - start_offset if length > max_tokens_for_doc: length = max_tokens_for_doc + # print("Start offset is", start_offset, len(all_doc_tokens), "length is", length) doc_spans.append(_DocSpan(start=start_offset, length=length)) if start_offset + length == len(all_doc_tokens): - print("Done with this doc span, breaking out.", start_offset, length) break - print("CHOOSING OFFSET", length, doc_stride) start_offset += min(length, doc_stride) - print("OLD DOC CREATION END", start_offset) - print("old span", len(doc_spans)) for (doc_span_index, doc_span) in enumerate(doc_spans): tokens = [] token_to_orig_map = {} @@ -382,7 +377,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, input_mask.append(0 if mask_padding_with_zero else 1) segment_ids.append(pad_token_segment_id) p_mask.append(1) - print("[OLD] Ids computed; position of the first padding", input_ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in input_ids else None) + assert len(input_ids) == max_seq_length assert len(input_mask) == max_seq_length assert len(segment_ids) == max_seq_length @@ -440,7 +435,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # logger.info( # "answer: %s" % (answer_text)) - print("features length", len(features)) features.append( SquadFeatures( unique_id=unique_id, @@ -464,10 +458,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, assert len(features) == len(new_features) for i in range(len(features)): - print(i) feature, 
new_feature = features[i], new_features[i] - input_ids = feature.input_ids + input_ids = [f if f not in [3,4,5] else 0 for f in feature.input_ids ] input_mask = feature.input_mask segment_ids = feature.segment_ids cls_index = feature.cls_index @@ -478,7 +471,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tokens = feature.tokens token_to_orig_map = feature.token_to_orig_map - new_input_ids = new_feature.input_ids + new_input_ids = [f if f not in [3,4,5] else 0 for f in new_feature.input_ids] new_input_mask = new_feature.attention_mask new_segment_ids = new_feature.token_type_ids new_cls_index = new_feature.cls_index @@ -497,6 +490,9 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, assert example_index == new_example_index assert paragraph_len == new_paragraph_len assert token_is_max_context == new_token_is_max_context + + tokens = [t if tokenizer.convert_tokens_to_ids(t) is not tokenizer.unk_token_id else tokenizer.unk_token for t in tokens] + assert tokens == new_tokens assert token_to_orig_map == new_token_to_orig_map From e0e55bc550a16289763b4f656790e30ed86e428f Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 22 Nov 2019 16:18:18 -0500 Subject: [PATCH 030/302] Manage training example & refactor the refactor --- transformers/data/processors/squad.py | 368 ++++---------------------- 1 file changed, 51 insertions(+), 317 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index fb3d2ae4d4..3d8f48c1bb 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -92,31 +92,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, features = [] new_features = [] for (example_index, example) in enumerate(tqdm(examples)): - - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - - # Split on whitespace so that different tokens may be attributed to their original position. - for c in example.context_text: - if _is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - if is_training: # Get start and end position answer_length = len(example.answer_text) - start_position = char_to_word_offset[example.start_position] - end_position = char_to_word_offset[example.start_position + answer_length - 1] + start_position = example.start_position + end_position = example.end_position # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) + actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) @@ -125,7 +108,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] - for (i, token) in enumerate(doc_tokens): + for (i, token) in enumerate(example.doc_tokens): orig_to_tok_index.append(len(all_doc_tokens)) sub_tokens = tokenizer.tokenize(token) for sub_token in sub_tokens: @@ -138,56 +121,19 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair - encoded_dict = tokenizer.encode_plus( - truncated_query if not sequence_a_is_doc else all_doc_tokens, - all_doc_tokens if not sequence_a_is_doc else truncated_query, - max_length=max_seq_length, - padding_strategy='right', - stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - return_overflowing_tokens=True, - truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' - ) - - ids = encoded_dict['input_ids'] - non_padded_ids = ids[:ids.index(tokenizer.pad_token_id)] if tokenizer.pad_token_id in ids else ids - paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) - tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) - - token_to_orig_map = {} - for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i - token_to_orig_map[index] = tok_to_orig_index[0 + i] - - encoded_dict["paragraph_len"] = paragraph_len - encoded_dict["tokens"] = tokens - encoded_dict["token_to_orig_map"] = token_to_orig_map - encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens - encoded_dict["token_is_max_context"] = {} - encoded_dict["start"] = 0 - encoded_dict["length"] = paragraph_len - - spans.append(encoded_dict) - # print("YESSIR", len(spans) * doc_stride < len(all_doc_tokens), "overflowing_tokens" in encoded_dict) - - while len(spans) * doc_stride < len(all_doc_tokens) and "overflowing_tokens" in encoded_dict: - overflowing_tokens = encoded_dict["overflowing_tokens"] + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + encoded_dict = tokenizer.encode_plus( - truncated_query if not sequence_a_is_doc else overflowing_tokens, - overflowing_tokens if not sequence_a_is_doc else truncated_query, + truncated_query if not sequence_a_is_doc else span_doc_tokens, + span_doc_tokens if not sequence_a_is_doc else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, padding_strategy='right', stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' ) - ids = encoded_dict['input_ids'] - # print("Ids computes; position of the first padding", ids.index(tokenizer.pad_token_id) if tokenizer.pad_token_id in ids else None) - # print(encoded_dict["input_ids"].index(tokenizer.pad_token_id) if tokenizer.pad_token_id in encoded_dict["input_ids"] else None) - # print(len(spans) * doc_stride, len(all_doc_tokens)) - - - # Length of the document without the query paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) if tokenizer.pad_token_id in encoded_dict['input_ids']: @@ -212,6 +158,10 
@@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, spans.append(encoded_dict) + if "overflowing_tokens" not in encoded_dict: + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) @@ -254,249 +204,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, unique_id += 1 - # tokenize ... - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. - _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - # print("Start offset is", start_offset, len(all_doc_tokens), "length is", length) - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - - # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - # Original TF implem also keep the classification token (set to 0) (not sure why...) 
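For reference, a toy sketch of what this p_mask convention means in the reworked computation (token_type_ids clipped, inverted, SEP forced to 1, CLS forced to 0), assuming a BERT-style [CLS] question [SEP] context [SEP] layout with made-up token ids rather than anything taken from this patch:

import numpy as np

# 0 = CLS + query + first SEP, 1 = context + final SEP (toy values)
token_type_ids = np.array([0, 0, 0, 0, 1, 1, 1, 1])
input_ids      = np.array([101, 2054, 2003, 102, 3000, 2003, 2307, 102])
sep_token_id, cls_index = 102, 0

p_mask = 1 - np.minimum(token_type_ids, 1)      # context tokens -> 0, query tokens -> 1
p_mask[input_ids == sep_token_id] = 1           # SEP tokens can never be part of the answer
p_mask[cls_index] = 0                           # keep CLS available for the null answer
print(p_mask.tolist())                          # [0, 1, 1, 1, 0, 0, 0, 1]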
- p_mask = [] - - # CLS token at the beginning - if not cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = 0 - - # XLNet: P SEP Q SEP CLS - # Others: CLS Q SEP P SEP - if not sequence_a_is_doc: - # Query - tokens += query_tokens - segment_ids += [sequence_a_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - # Paragraph - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - if not sequence_a_is_doc: - segment_ids.append(sequence_b_segment_id) - else: - segment_ids.append(sequence_a_segment_id) - p_mask.append(0) - paragraph_len = doc_span.length - - if sequence_a_is_doc: - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - tokens += query_tokens - segment_ids += [sequence_b_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_b_segment_id) - p_mask.append(1) - - # CLS token at the end - if cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = len(tokens) - 1 # Index of classification token - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(pad_token) - input_mask.append(0 if mask_padding_with_zero else 1) - segment_ids.append(pad_token_segment_id) - p_mask.append(1) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - span_is_impossible = example.is_impossible if hasattr(example, "is_impossible") else False - start_position = None - end_position = None - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. 
- doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - span_is_impossible = True - else: - if sequence_a_is_doc: - doc_offset = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - if is_training and span_is_impossible: - start_position = cls_index - end_position = cls_index - - # if example_index < 20: - # logger.info("*** Example ***") - # logger.info("unique_id: %s" % (unique_id)) - # logger.info("example_index: %s" % (example_index)) - # logger.info("doc_span_index: %s" % (doc_span_index)) - # logger.info("tokens: %s" % str(tokens)) - # logger.info("token_to_orig_map: %s" % " ".join([ - # "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - # logger.info("token_is_max_context: %s" % " ".join([ - # "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - # ])) - # logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - # logger.info( - # "input_mask: %s" % " ".join([str(x) for x in input_mask])) - # logger.info( - # "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - # if is_training and span_is_impossible: - # logger.info("impossible example") - # if is_training and not span_is_impossible: - # answer_text = " ".join(tokens[start_position:(end_position + 1)]) - # logger.info("start_position: %d" % (start_position)) - # logger.info("end_position: %d" % (end_position)) - # logger.info( - # "answer: %s" % (answer_text)) - - features.append( - SquadFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - cls_index=cls_index, - p_mask=p_mask, - paragraph_len=paragraph_len, - start_position=start_position, - end_position=end_position, - is_impossible=span_is_impossible)) - unique_id += 1 - - assert len(features) == len(new_features) - - assert len(features) == len(new_features) - for i in range(len(features)): - feature, new_feature = features[i], new_features[i] - - input_ids = [f if f not in [3,4,5] else 0 for f in feature.input_ids ] - input_mask = feature.input_mask - segment_ids = feature.segment_ids - cls_index = feature.cls_index - p_mask = feature.p_mask - example_index = feature.example_index - paragraph_len = feature.paragraph_len - token_is_max_context = feature.token_is_max_context - tokens = feature.tokens - token_to_orig_map = feature.token_to_orig_map - - new_input_ids = [f if f not in [3,4,5] else 0 for f in new_feature.input_ids] - new_input_mask = new_feature.attention_mask - new_segment_ids = new_feature.token_type_ids - new_cls_index = new_feature.cls_index - new_p_mask = new_feature.p_mask - new_example_index = new_feature.example_index - new_paragraph_len = new_feature.paragraph_len - new_token_is_max_context = new_feature.token_is_max_context - new_tokens = new_feature.tokens - new_token_to_orig_map = new_feature.token_to_orig_map - - assert input_ids == new_input_ids - assert input_mask == new_input_mask - assert segment_ids == new_segment_ids - assert cls_index == new_cls_index - assert p_mask == new_p_mask - assert example_index == new_example_index - assert paragraph_len == new_paragraph_len - 
assert token_is_max_context == new_token_is_max_context - - tokens = [t if tokenizer.convert_tokens_to_ids(t) is not tokenizer.unk_token_id else tokenizer.unk_token for t in tokens] - - assert tokens == new_tokens - assert token_to_orig_map == new_token_to_orig_map - - return new_features @@ -592,35 +299,35 @@ class SquadV1Processor(DataProcessor): tensor_dict['title'].numpy().decode('utf-8') ) - def get_train_examples(self, data_dir): + def get_train_examples(self, data_dir, only_first=None): """See base class.""" with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "train") + return self._create_examples(input_data, "train", only_first) - def get_dev_examples(self, data_dir): + def get_dev_examples(self, data_dir, only_first=None): """See base class.""" with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "dev") + return self._create_examples(input_data, "dev", only_first) def get_labels(self): """See base class.""" return ["0", "1"] - def _create_examples(self, input_data, set_type): + def _create_examples(self, input_data, set_type, only_first=None): """Creates examples for the training and dev sets.""" is_training = set_type == "train" examples = [] - for entry in input_data: + for entry in tqdm(input_data): title = entry['title'] for paragraph in entry["paragraphs"]: context_text = paragraph["context"] for qa in paragraph["qas"]: qas_id = qa["id"] question_text = qa["question"] - start_position = None + start_position_character = None answer_text = None if is_training: if (len(qa["answers"]) != 1): @@ -628,17 +335,20 @@ class SquadV1Processor(DataProcessor): "For training, each question should have exactly 1 answer.") answer = qa["answers"][0] answer_text = answer['text'] - start_position = answer['answer_start'] + start_position_character = answer['answer_start'] example = NewSquadExample( qas_id=qas_id, question_text=question_text, context_text=context_text, answer_text=answer_text, - start_position=start_position, + start_position_character=start_position_character, title=title ) examples.append(example) + + if only_first is not None and len(examples) > only_first: + return examples return examples @@ -653,14 +363,38 @@ class NewSquadExample(object): question_text, context_text, answer_text, - start_position, + start_position_character, title): self.qas_id = qas_id self.question_text = question_text self.context_text = context_text self.answer_text = answer_text - self.start_position = start_position self.title = title + self.is_impossible = False + + doc_tokens = [] + char_to_word_offset = [] + prev_is_whitespace = True + + # Split on whitespace so that different tokens may be attributed to their original position. + for c in self.context_text: + if _is_whitespace(c): + prev_is_whitespace = True + else: + if prev_is_whitespace: + doc_tokens.append(c) + else: + doc_tokens[-1] += c + prev_is_whitespace = False + char_to_word_offset.append(len(doc_tokens) - 1) + + self.doc_tokens = doc_tokens + self.char_to_word_offset = char_to_word_offset + + # Start end end positions only has a value during evaluation. 
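As a concrete illustration of the character-to-word mapping built in the constructor above, here is a small self-contained walk-through of the same whitespace-splitting logic; the sentence and answer are made-up examples, not SQuAD data, and _is_whitespace is simplified to a plain character check:

context = "Super Bowl 50 was an American football game."
answer_text = "American football"
start_position_character = context.index(answer_text)            # 21

doc_tokens, char_to_word_offset = [], []
prev_is_whitespace = True
for c in context:
    if c in " \t\r\n":                                            # simplified _is_whitespace
        prev_is_whitespace = True
    else:
        if prev_is_whitespace:
            doc_tokens.append(c)
        else:
            doc_tokens[-1] += c
        prev_is_whitespace = False
    char_to_word_offset.append(len(doc_tokens) - 1)

start_position = char_to_word_offset[start_position_character]                          # 5
end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]     # 6
assert doc_tokens[start_position:end_position + 1] == ["American", "football"]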
+ if start_position_character is not None: + self.start_position = char_to_word_offset[start_position_character] + self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1] class NewSquadFeatures(object): From 0669c1fcd15051ec6fe2d950079886faccf2fb33 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 25 Nov 2019 19:22:21 -0500 Subject: [PATCH 031/302] SQuAD v2 BERT + XLNet --- transformers/__init__.py | 2 +- transformers/data/__init__.py | 2 +- transformers/data/processors/__init__.py | 2 +- transformers/data/processors/squad.py | 180 +++++++++++------------ 4 files changed, 92 insertions(+), 94 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index 9a767913b3..f3f81f1dbe 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -27,7 +27,7 @@ from .data import (is_sklearn_available, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, squad_convert_examples_to_features, SquadFeatures, - SquadExample, read_squad_examples) + SquadExample) if is_sklearn_available(): from .data import glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index 50f2e768f4..b351bf625e 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,6 +1,6 @@ from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .processors import squad_convert_examples_to_features, SquadExample, read_squad_examples +from .processors import squad_convert_examples_to_features, SquadExample from .metrics import is_sklearn_available if is_sklearn_available(): diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 924b4a1245..1e52776629 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, read_squad_examples +from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 3d8f48c1bb..39ee00ae56 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -46,7 +46,6 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index - def _new_check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" # if len(doc_spans) == 1: @@ -92,7 +91,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, features = [] new_features = [] for (example_index, example) in enumerate(tqdm(examples)): - if is_training: + if is_training and not example.is_impossible: # Get start and end position answer_length = len(example.answer_text) start_position = example.start_position @@ -105,6 +104,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) continue + tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] @@ -115,6 +115,18 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) @@ -187,6 +199,34 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # Set the CLS index to '0' p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if sequence_a_is_doc: + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + new_features.append(NewSquadFeatures( span['input_ids'], span['attention_mask'], @@ -199,7 +239,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, paragraph_len=span['paragraph_len'], token_is_max_context=span["token_is_max_context"], tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"] + token_to_orig_map=span["token_to_orig_map"], + + start_position=start_position, + end_position=end_position )) unique_id += 1 @@ -207,86 +250,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, return new_features -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each 
question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. - actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -class SquadV1Processor(DataProcessor): +class SquadProcessor(DataProcessor): """Processor for the SQuAD data set.""" + train_file = None + dev_file = None def get_example_from_tensor_dict(self, tensor_dict): """See base class.""" @@ -301,13 +268,19 @@ class SquadV1Processor(DataProcessor): def get_train_examples(self, data_dir, only_first=None): """See base class.""" - with open(os.path.join(data_dir, "train-v1.1.json"), "r", encoding='utf-8') as reader: + if self.train_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train", only_first) def get_dev_examples(self, data_dir, only_first=None): """See base class.""" - with open(os.path.join(data_dir, "dev-v1.1.json"), "r", encoding='utf-8') as reader: + if self.dev_file is None: + raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") + + with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev", only_first) @@ -329,7 +302,13 @@ class SquadV1Processor(DataProcessor): question_text = qa["question"] start_position_character = None answer_text = None - if is_training: + + if "is_impossible" in qa: + is_impossible = qa["is_impossible"] + else: + is_impossible = False + + if not is_impossible and is_training: if (len(qa["answers"]) != 1): raise ValueError( "For training, each question should have exactly 1 answer.") @@ -343,15 +322,25 @@ class SquadV1Processor(DataProcessor): context_text=context_text, answer_text=answer_text, start_position_character=start_position_character, - title=title + title=title, + is_impossible=is_impossible ) + examples.append(example) if only_first is not None and len(examples) > only_first: return examples return examples - +class SquadV1Processor(SquadProcessor): + train_file = "train-v1.1.json" + dev_file = "dev-v1.1.json" + + +class SquadV2Processor(SquadProcessor): + train_file = "train-v2.0.json" + dev_file = "dev-v2.0.json" + class NewSquadExample(object): """ @@ -364,13 +353,16 
@@ class NewSquadExample(object): context_text, answer_text, start_position_character, - title): + title, + is_impossible=False): self.qas_id = qas_id self.question_text = question_text self.context_text = context_text self.answer_text = answer_text self.title = title - self.is_impossible = False + self.is_impossible = is_impossible + + self.start_position, self.end_position = 0, 0 doc_tokens = [] char_to_word_offset = [] @@ -392,7 +384,7 @@ class NewSquadExample(object): self.char_to_word_offset = char_to_word_offset # Start end end positions only has a value during evaluation. - if start_position_character is not None: + if start_position_character is not None and not is_impossible: self.start_position = char_to_word_offset[start_position_character] self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1] @@ -415,7 +407,10 @@ class NewSquadFeatures(object): paragraph_len, token_is_max_context, tokens, - token_to_orig_map + token_to_orig_map, + + start_position, + end_position ): self.input_ids = input_ids self.attention_mask = attention_mask @@ -430,6 +425,9 @@ class NewSquadFeatures(object): self.tokens = tokens self.token_to_orig_map = token_to_orig_map + self.start_position = start_position + self.end_position = end_position + class SquadExample(object): """ A single training/test example for the Squad dataset. From bd41e8292a4bd7db10eb036112019d93c50adcf5 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 28 Nov 2019 16:03:56 -0500 Subject: [PATCH 032/302] Cleanup & Evaluation now works --- examples/run_squad.py | 44 +++++++++++---------------- transformers/data/processors/squad.py | 14 ++------- 2 files changed, 20 insertions(+), 38 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 634b566a46..545c3ad55a 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -16,7 +16,7 @@ """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function -from transformers.data.processors.squad import SquadV1Processor +from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor import argparse import logging @@ -45,9 +45,9 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer) -from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features, read_squad_examples as sread_squad_examples +from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features -from utils_squad import (RawResult, write_predictions, +from utils_squad import (convert_examples_to_features as old_convert, read_squad_examples as old_read, RawResult, write_predictions, RawResultExtended, write_predictions_extended) # The follwing import is the official SQuAD evaluation script (2.0). 
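The hunk below moves load_and_cache_examples onto this processor API. A rough, self-contained sketch of the same flow, based only on the calls shown in this diff — the tokenizer choice, data directory and sequence lengths are placeholder values, not the script's required settings:

from transformers import BertTokenizer, squad_convert_examples_to_features
from transformers.data.processors.squad import SquadV2Processor

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
processor = SquadV2Processor()                              # reads dev-v2.0.json / train-v2.0.json
examples = processor.get_dev_examples("examples/squad")     # placeholder data directory
features = squad_convert_examples_to_features(
    examples=examples,
    tokenizer=tokenizer,
    max_seq_length=384,
    doc_stride=128,
    max_query_length=64,
    is_training=False,
)
print(len(features))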
@@ -304,28 +304,20 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal features = torch.load(cached_features_file) else: logger.info("Creating features from dataset file at %s", input_file) - examples = read_squad_examples(input_file=input_file, - is_training=not evaluate, - version_2_with_negative=args.version_2_with_negative) - keep_n_examples = 1000 - processor = SquadV1Processor() - values = processor.get_dev_examples("examples/squad") - examples = values[:keep_n_examples] - features = squad_convert_examples_to_features(examples=exampless, - tokenizer=tokenizer, - max_seq_length=args.max_seq_length, - doc_stride=args.doc_stride, - max_query_length=args.max_query_length, - is_training=not evaluate, - cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0, - pad_token_segment_id=3 if args.model_type in ['xlnet'] else 0, - cls_token_at_end=True if args.model_type in ['xlnet'] else False, - sequence_a_is_doc=True if args.model_type in ['xlnet'] else False) - print("DONE") - import sys - sys.exit() - + processor = SquadV2Processor() + examples = processor.get_dev_examples("examples/squad") if evaluate else processor.get_train_examples("examples/squad") + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + sequence_a_is_doc=True if args.model_type in ['xlnet'] else False + ) + + if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save(features, cached_features_file) @@ -335,8 +327,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if evaluate: diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 39ee00ae56..3d5a3eca80 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -74,26 +74,16 @@ def _is_whitespace(c): def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, - cls_token_at_end=True, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True, sequence_a_is_doc=False): """Loads a data file into a list of `InputBatch`s.""" - cls_token = tokenizer.cls_token - sep_token = tokenizer.sep_token - # Defining helper methods unique_id = 1000000000 features = [] - new_features = [] for (example_index, example) in enumerate(tqdm(examples)): if is_training and not example.is_impossible: # Get start and end position - answer_length = len(example.answer_text) start_position = example.start_position end_position = example.end_position @@ -227,7 +217,7 @@ def squad_convert_examples_to_features(examples, tokenizer, 
max_seq_length, end_position = tok_end_position - doc_start + doc_offset - new_features.append(NewSquadFeatures( + features.append(NewSquadFeatures( span['input_ids'], span['attention_mask'], span['token_type_ids'], @@ -247,7 +237,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, unique_id += 1 - return new_features + return features class SquadProcessor(DataProcessor): From f671997ef74199823db83ed7b43340764888e129 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 28 Nov 2019 17:17:20 -0500 Subject: [PATCH 033/302] Interface with TFDS --- transformers/data/processors/squad.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 3d5a3eca80..52c2c28add 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -246,16 +246,24 @@ class SquadProcessor(DataProcessor): dev_file = None def get_example_from_tensor_dict(self, tensor_dict): - """See base class.""" return NewSquadExample( - tensor_dict['id'].numpy(), + tensor_dict['id'].numpy().decode("utf-8"), tensor_dict['question'].numpy().decode('utf-8'), tensor_dict['context'].numpy().decode('utf-8'), - tensor_dict['answers']['text'].numpy().decode('utf-8'), - tensor_dict['answers']['answers_start'].numpy().decode('utf-8'), + tensor_dict['answers']['text'][0].numpy().decode('utf-8'), + tensor_dict['answers']['answer_start'][0].numpy(), tensor_dict['title'].numpy().decode('utf-8') ) + def get_examples_from_dataset(self, dataset): + """See base class.""" + + examples = [] + for tensor_dict in tqdm(dataset): + examples.append(self.get_example_from_tensor_dict(tensor_dict)) + + return examples + def get_train_examples(self, data_dir, only_first=None): """See base class.""" if self.train_file is None: From 0b84b9fd8a728ca46e4109aa38a11b25f87a09bf Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 28 Nov 2019 17:38:52 -0500 Subject: [PATCH 034/302] Add processors to __init__ --- transformers/__init__.py | 2 +- transformers/data/__init__.py | 2 +- transformers/data/processors/__init__.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index f3f81f1dbe..aefa3f1921 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -27,7 +27,7 @@ from .data import (is_sklearn_available, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, squad_convert_examples_to_features, SquadFeatures, - SquadExample) + SquadExample, SquadV1Processor, SquadV2Processor) if is_sklearn_available(): from .data import glue_compute_metrics diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index b351bf625e..ea3a4e9fbb 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,6 +1,6 @@ from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .processors import squad_convert_examples_to_features, SquadExample +from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor from .metrics import is_sklearn_available if is_sklearn_available(): diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 1e52776629..2470e7a06d 100644 --- a/transformers/data/processors/__init__.py +++ 
b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ from .utils import InputExample, InputFeatures, DataProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features -from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample +from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor From 1e9ac5a7cfeb48ff6a1cf20e07941fc8c59b391d Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 28 Nov 2019 17:43:47 -0500 Subject: [PATCH 035/302] New -> normal --- transformers/data/processors/squad.py | 106 ++------------------------ 1 file changed, 5 insertions(+), 101 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 52c2c28add..f414d41925 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -217,7 +217,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, end_position = tok_end_position - doc_start + doc_offset - features.append(NewSquadFeatures( + features.append(SquadFeatures( span['input_ids'], span['attention_mask'], span['token_type_ids'], @@ -246,7 +246,7 @@ class SquadProcessor(DataProcessor): dev_file = None def get_example_from_tensor_dict(self, tensor_dict): - return NewSquadExample( + return SquadExample( tensor_dict['id'].numpy().decode("utf-8"), tensor_dict['question'].numpy().decode('utf-8'), tensor_dict['context'].numpy().decode('utf-8'), @@ -314,7 +314,7 @@ class SquadProcessor(DataProcessor): answer_text = answer['text'] start_position_character = answer['answer_start'] - example = NewSquadExample( + example = SquadExample( qas_id=qas_id, question_text=question_text, context_text=context_text, @@ -340,7 +340,7 @@ class SquadV2Processor(SquadProcessor): dev_file = "dev-v2.0.json" -class NewSquadExample(object): +class SquadExample(object): """ A single training/test example for the Squad dataset, as loaded from disk. """ @@ -387,7 +387,7 @@ class NewSquadExample(object): self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1] -class NewSquadFeatures(object): +class SquadFeatures(object): """ Single squad example features to be fed to a model. Those features are model-specific. @@ -425,99 +425,3 @@ class NewSquadFeatures(object): self.start_position = start_position self.end_position = end_position - -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class SquadFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.cls_index = cls_index - self.p_mask = p_mask - self.paragraph_len = paragraph_len - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __eq__(self, other): - print(self.example_index == other.example_index) - print(self.input_ids == other.input_ids) - print(self.input_mask == other.attention_mask) - print(self.p_mask == other.p_mask) - print(self.paragraph_len == other.paragraph_len) - print(self.segment_ids == other.token_type_ids) - print(self.token_is_max_context == other.token_is_max_context) - print(self.token_to_orig_map == other.token_to_orig_map) - print(self.tokens == other.tokens) - - return self.example_index == other.example_index and \ - self.input_ids == other.input_ids and \ - self.input_mask == other.attention_mask and \ - self.p_mask == other.p_mask and \ - self.paragraph_len == other.paragraph_len and \ - self.segment_ids == other.token_type_ids and \ - self.token_is_max_context == other.token_is_max_context and \ - self.token_to_orig_map == other.token_to_orig_map and \ - self.tokens == other.tokens \ No newline at end of file From f3776df0f3daca86634862fe3ba7da6ae2b9a663 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 2 Dec 2019 15:47:00 +0100 Subject: [PATCH 036/302] WIP debugging --- transformers/modeling_t5.py | 61 +++++++++++++++++++++++++++---------- 1 file changed, 45 insertions(+), 16 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 2a74333d31..1bf55611a2 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -132,6 +132,21 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path): # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module) #################################################### +class T5LayerNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ Construct a layernorm module in the T5 style + No bias and no substraction of mean. 
+ """ + super(T5LayerNorm, self).__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, x): + variance = x.pow(2).mean(-1, keepdim=True) + x = x / torch.sqrt(variance + self.variance_epsilon) + return self.weight * x + + class T5DenseReluDense(nn.Module): def __init__(self, config): super(T5DenseReluDense, self).__init__() @@ -151,7 +166,7 @@ class T5LayerFF(nn.Module): def __init__(self, config): super(T5LayerFF, self).__init__() self.DenseReluDense = T5DenseReluDense(config) - self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states): @@ -316,13 +331,14 @@ class T5Attention(nn.Module): cache[self.layer_id] = (k, v) # q = q / math.sqrt(dim_per_head) # No scaling in T5 - scores = torch.matmul(q, k.transpose(2, 3)) # (bs, n_heads, qlen, klen) + scores = torch.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(qlen, klen) scores += position_bias + special_out = position_bias if mask is not None: scores += mask @@ -346,14 +362,14 @@ class T5Attention(nn.Module): outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) - return outputs + return outputs + (special_out,) class T5LayerSelfAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super(T5LayerSelfAttention, self).__init__() self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None): @@ -363,16 +379,18 @@ class T5LayerSelfAttention(nn.Module): position_bias=position_bias, head_mask=head_mask) y = attention_output[0] + special_out = attention_output[-1] + attention_output = attention_output[:-1] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them - return outputs + return outputs + (special_out,) class T5LayerCrossAttention(nn.Module): def __init__(self, config, has_relative_attention_bias=False): super(T5LayerCrossAttention, self).__init__() self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias) - self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None): @@ -408,7 +426,8 @@ class T5Block(nn.Module): position_bias=position_bias, head_mask=head_mask) hidden_states = self_attention_outputs[0] - outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights + special_out = self_attention_outputs[-1] + outputs = self_attention_outputs[1:-1] # Keep self-attention outputs and relative position weights if not self.is_decoder: hidden_states = self.layer[1](hidden_states) @@ -423,7 +442,7 @@ class T5Block(nn.Module): 
hidden_states = self.layer[2](hidden_states) outputs = (hidden_states,) + outputs # add attentions if we output them - return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) + return outputs + (special_out,) # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) class T5PreTrainedModel(PreTrainedModel): @@ -438,8 +457,7 @@ class T5PreTrainedModel(PreTrainedModel): def _init_weights(self, module): """ Initialize the weights """ factor = self.config.initializer_factor # Used for testing weights initialization - if isinstance(module, nn.LayerNorm): - module.bias.data.zero_() + if isinstance(module, T5LayerNorm): module.weight.data.fill_(factor*1.0) elif isinstance(module, (T5Model, T5WithLMHeadModel)): # Mesh TensorFlow embeddings initialization @@ -478,7 +496,7 @@ class T5Stack(T5PreTrainedModel): self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0)) for i in range(config.num_layers)]) - self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon) + self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon) self.dropout = nn.Dropout(config.dropout_rate) self.init_weights() @@ -515,11 +533,11 @@ class T5Stack(T5PreTrainedModel): # Since attention_mask is 1.0 for positions we want to attend and 0.0 for # masked positions, this operation will create a tensor which is 0.0 for - # positions we want to attend and -10000.0 for masked positions. + # positions we want to attend and -1e9 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. 
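A small sketch of why the additive -1e9 mask described in the comment above is effectively the same as removing the masked positions; the shapes and zero scores below are toy values, purely illustrative:

import torch

attention_mask = torch.tensor([[1, 1, 1, 0]])            # 1 = real token, 0 = padding
extended = attention_mask[:, None, None, :].float()      # broadcastable to (bs, heads, qlen, klen)
extended = (1.0 - extended) * -1e9                       # 0.0 where attended, -1e9 where masked
scores = torch.zeros(1, 1, 4, 4)                         # dummy raw attention scores
weights = torch.softmax(scores + extended, dim=-1)
assert torch.allclose(weights[..., -1], torch.zeros(1, 1, 4))   # padded key gets ~0 weight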
extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + extended_attention_mask = (1.0 - extended_attention_mask) * -1e9 if self.is_decoder: # If a 2D ou 3D attention mask is provided for the cross-attention @@ -530,7 +548,7 @@ class T5Stack(T5PreTrainedModel): encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: encoder_extended_attention_mask = None @@ -553,6 +571,8 @@ class T5Stack(T5PreTrainedModel): all_attentions = () position_bias = None encoder_decoder_position_bias = None + + hidden_states = self.dropout(hidden_states) for i, layer_module in enumerate(self.block): if self.output_hidden_states: all_hidden_states = all_hidden_states + (hidden_states,) @@ -564,6 +584,8 @@ class T5Stack(T5PreTrainedModel): encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i]) + if i == 0: + special_out = layer_outputs[-1] # layer_outputs is a tuple with: # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states = layer_outputs[0] @@ -588,7 +610,7 @@ class T5Stack(T5PreTrainedModel): outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) - return outputs # last-layer hidden state, (all hidden states), (all attentions) + return outputs + (special_out,) # last-layer hidden state, (all hidden states), (all attentions) T5_START_DOCSTRING = r""" The T5 model was proposed in @@ -707,9 +729,16 @@ class T5Model(T5PreTrainedModel): # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) + encoder_attention_mask = kwargs_encoder.get("attention_mask", None) if encoder_hidden_states is None: encoder_inputs_ids = kwargs_encoder.pop("input_ids") hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + + if encoder_attention_mask is not None: + # Apply masking + encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states) + hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1) + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) encoder_hidden_states = encoder_outputs[0] else: @@ -719,7 +748,7 @@ class T5Model(T5PreTrainedModel): decoder_inputs_ids = kwargs_decoder.pop("input_ids") hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states - kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) + kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) return decoder_outputs + encoder_outputs From 285b1241e38cdafb6b0dadd1d1afc19493318074 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 3 Dec 2019 15:00:49 -0500 Subject: [PATCH 037/302] Added SquadResult --- transformers/data/processors/squad.py | 71 +++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/transformers/data/processors/squad.py 
b/transformers/data/processors/squad.py index f414d41925..afbe4270f5 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -425,3 +425,74 @@ class SquadFeatures(object): self.start_position = start_position self.end_position = end_position + + + +class SquadResult(object): + """ + Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. + + Args: + result: The result output by a model on a SQuAD inference. These results may be complex (5 values) as the ones output by + XLNet or XLM or may be simple like the other models (2 values). They may be passed as a list or as a dict, with the + following accepted formats: + + `dict` output by a simple model: + { + "start_logits": int, + "end_logits": int, + "unique_id": string + } + `list` output by a simple model: + [start_logits, end_logits, unique_id] + + `dict` output by a complex model: + { + "start_top_log_probs": float, + "start_top_index": int, + "end_top_log_probs": float, + "end_top_index": int, + "cls_logits": int, + "unique_id": string + } + `list` output by a complex model: + [start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, unique_id] + + See `run_squad.py` for an example. + """ + def __init__(self, result): + if isinstance(result, dict): + if "start_logits" in result and "end_logits" in result: + self.start_logits = result["start_logits"] + self.end_logits = result["end_logits"] + + elif "start_top_log_probs" in result and "start_top_index" in result: + self.start_top_log_probs = result["start_top_log_probs"] + self.start_top_index = result["start_top_index"] + self.end_top_log_probs = result["end_top_log_probs"] + self.end_top_index = result["end_top_index"] + self.cls_logits = result["cls_logits"] + + else: + raise ValueError("SquadResult instantiated with wrong values.") + + self.unique_id = result["unique_id"] + elif isinstance(result, list): + if len(result) == 3: + self.start_logits = result[0] + self.end_logits = result[1] + + elif len(result) == 6: + self.start_top_log_probs = result[0] + self.start_top_index = result[1] + self.end_top_log_probs = result[2] + self.end_top_index = result[3] + self.cls_logits = result[4] + + else: + raise ValueError("SquadResult instantiated with wrong values.") + + self.unique_id = result[-1] + + else: + raise ValueError("SquadResult instantiated with wrong values. 
Should be a dictionary or a list.") From c835bc85c2f51f4da5eab4f1481a25b052bf6d61 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 3 Dec 2019 15:28:16 -0500 Subject: [PATCH 038/302] Compute predictions --- transformers/data/metrics/squad_metrics.py | 335 +++++++++++++++++++++ 1 file changed, 335 insertions(+) create mode 100644 transformers/data/metrics/squad_metrics.py diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py new file mode 100644 index 0000000000..d4c5a8ec5b --- /dev/null +++ b/transformers/data/metrics/squad_metrics.py @@ -0,0 +1,335 @@ +import json +import logging +import math +import collections +from io import open +from tqdm import tqdm + +from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize + +logger = logging.getLogger(__name__) + + +def compute_predictions(all_examples, all_features, all_results, n_best_size, + max_answer_length, do_lower_case, output_prediction_file, + output_nbest_file, output_null_log_odds_file, verbose_logging, + version_2_with_negative, null_score_diff_threshold): + """Write final predictions to the json file and log-odds of null if needed.""" + logger.info("Writing predictions to: %s" % (output_prediction_file)) + logger.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + min_null_feature_index = 0 # the paragraph slice with min null score + null_start_logit = 0 # the start logit at the slice with min null score + null_end_logit = 0 # the end logit at the slice with min null score + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + start_indexes = _get_best_indexes(result.start_logits, n_best_size) + end_indexes = _get_best_indexes(result.end_logits, n_best_size) + # if we could have irrelevant answers, get the min score of irrelevant + if version_2_with_negative: + feature_null_score = result.start_logits[0] + result.end_logits[0] + if feature_null_score < score_null: + score_null = feature_null_score + min_null_feature_index = feature_index + null_start_logit = result.start_logits[0] + null_end_logit = result.end_logits[0] + for start_index in start_indexes: + for end_index in end_indexes: + # We could hypothetically create invalid predictions, e.g., predict + # that the start of the span is in the question. We throw out all + # invalid predictions. 
+ if start_index >= len(feature.tokens): + continue + if end_index >= len(feature.tokens): + continue + if start_index not in feature.token_to_orig_map: + continue + if end_index not in feature.token_to_orig_map: + continue + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_logit=result.start_logits[start_index], + end_logit=result.end_logits[end_index])) + if version_2_with_negative: + prelim_predictions.append( + _PrelimPrediction( + feature_index=min_null_feature_index, + start_index=0, + end_index=0, + start_logit=null_start_logit, + end_logit=null_end_logit)) + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_logit + x.end_logit), + reverse=True) + + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_logit", "end_logit"]) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + if pred.start_index > 0: # this is a non-null prediction + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = " ".join(tok_tokens) + + # De-tokenize WordPieces that have been split off. + tok_text = tok_text.replace(" ##", "") + tok_text = tok_text.replace("##", "") + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) + if final_text in seen_predictions: + continue + + seen_predictions[final_text] = True + else: + final_text = "" + seen_predictions[final_text] = True + + nbest.append( + _NbestPrediction( + text=final_text, + start_logit=pred.start_logit, + end_logit=pred.end_logit)) + # if we didn't include the empty option in the n-best, include it + if version_2_with_negative: + if "" not in seen_predictions: + nbest.append( + _NbestPrediction( + text="", + start_logit=null_start_logit, + end_logit=null_end_logit)) + + # In very rare edge cases we could only have single null prediction. + # So we just create a nonce prediction in this case to avoid failure. + if len(nbest)==1: + nbest.insert(0, + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append( + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + + assert len(nbest) >= 1 + + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_logit + entry.end_logit) + if not best_non_null_entry: + if entry.text: + best_non_null_entry = entry + + probs = _compute_softmax(total_scores) + + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_logit"] = entry.start_logit + output["end_logit"] = entry.end_logit + nbest_json.append(output) + + assert len(nbest_json) >= 1 + + if not version_2_with_negative: + all_predictions[example.qas_id] = nbest_json[0]["text"] + else: + # predict "" iff the null score - the score of best non-null > threshold + score_diff = score_null - best_non_null_entry.start_logit - ( + best_non_null_entry.end_logit) + scores_diff_json[example.qas_id] = score_diff + if score_diff > null_score_diff_threshold: + all_predictions[example.qas_id] = "" + else: + all_predictions[example.qas_id] = best_non_null_entry.text + all_nbest_json[example.qas_id] = nbest_json + + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") + + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") + + return all_predictions + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". + # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. 
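[Editor's note: illustrative aside, not part of this patch.] A rough usage sketch of the projection heuristic documented above, reusing the "Steve Smith" example from the comment. The import path points at the module this patch creates, and the expected output comes from tracing the character alignment by hand, so treat both as assumptions rather than documented guarantees.

from transformers.data.metrics.squad_metrics import get_final_text

pred_text = "steve smith"    # normalized span recovered from WordPiece tokens
orig_text = "Steve Smith's"  # original whitespace-tokenized document span
final = get_final_text(pred_text, orig_text, do_lower_case=True)
# expected: "Steve Smith" (the trailing "'s" is dropped, the original casing is preserved)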
+ tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs From de276de1c1a469a58a25383a35a239d02459a978 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 3 Dec 2019 17:15:51 -0500 Subject: [PATCH 039/302] Working evaluation --- examples/run_squad.py | 43 +- transformers/data/metrics/squad_metrics.py | 588 +++++++++++++++++---- transformers/data/processors/squad.py | 19 +- 3 files changed, 507 insertions(+), 143 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 545c3ad55a..b7952487dc 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -16,7 +16,8 @@ """ Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" from __future__ import absolute_import, division, print_function -from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor +from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult +from transformers.data.metrics.squad_metrics import compute_predictions, compute_predictions_extended, squad_evaluate import argparse import logging @@ -230,9 +231,11 @@ def evaluate(args, model, tokenizer, 
prefix=""): model.eval() batch = tuple(t.to(args.device) for t in batch) with torch.no_grad(): - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1] - } + inputs = { + 'input_ids': batch[0], + 'attention_mask': batch[1] + } + if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids example_indices = batch[3] @@ -244,18 +247,8 @@ def evaluate(args, model, tokenizer, prefix=""): for i, example_index in enumerate(example_indices): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - if args.model_type in ['xlnet', 'xlm']: - # XLNet uses a more complex post-processing procedure - result = RawResultExtended(unique_id = unique_id, - start_top_log_probs = to_list(outputs[0][i]), - start_top_index = to_list(outputs[1][i]), - end_top_log_probs = to_list(outputs[2][i]), - end_top_index = to_list(outputs[3][i]), - cls_logits = to_list(outputs[4][i])) - else: - result = RawResult(unique_id = unique_id, - start_logits = to_list(outputs[0][i]), - end_logits = to_list(outputs[1][i])) + + result = SquadResult([to_list(output[i]) for output in outputs] + [unique_id]) all_results.append(result) evalTime = timeit.default_timer() - start_time @@ -271,22 +264,18 @@ def evaluate(args, model, tokenizer, prefix=""): if args.model_type in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure - write_predictions_extended(examples, features, all_results, args.n_best_size, + predictions = compute_predictions_extended(examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.predict_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: - write_predictions(examples, features, all_results, args.n_best_size, + predictions = compute_predictions(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) - # Evaluate with the official SQuAD script - evaluate_options = EVAL_OPTS(data_file=args.predict_file, - pred_file=output_prediction_file, - na_prob_file=output_null_log_odds_file) - results = evaluate_on_squad(evaluate_options) + results = squad_evaluate(examples, predictions) return results def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): @@ -306,8 +295,12 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal logger.info("Creating features from dataset file at %s", input_file) processor = SquadV2Processor() - examples = processor.get_dev_examples("examples/squad") if evaluate else processor.get_train_examples("examples/squad") - features = squad_convert_examples_to_features( + examples = processor.get_dev_examples("examples/squad", only_first=100) if evaluate else processor.get_train_examples("examples/squad") + # import tensorflow_datasets as tfds + # tfds_examples = tfds.load("squad") + # examples = SquadV1Processor().get_examples_from_dataset(tfds_examples["validation"]) + + features = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index d4c5a8ec5b..83647a20d0 100644 --- 
a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -1,15 +1,323 @@ +""" Very heavily inspired by the official evaluation script for SQuAD version 2.0 which was +modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 + +In addition to basic functionality, we also compute additional statistics and +plot precision-recall curves if an additional na_prob.json file is provided. +This file is expected to map question ID's to the model's predicted probability +that a question is unanswerable. +""" + + import json import logging import math import collections from io import open from tqdm import tqdm +import string +import re from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize logger = logging.getLogger(__name__) +def normalize_answer(s): + """Lower text and remove punctuation, articles and extra whitespace.""" + def remove_articles(text): + regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) + return re.sub(regex, ' ', text) + + def white_space_fix(text): + return ' '.join(text.split()) + + def remove_punc(text): + exclude = set(string.punctuation) + return ''.join(ch for ch in text if ch not in exclude) + + def lower(text): + return text.lower() + return white_space_fix(remove_articles(remove_punc(lower(s)))) + + +def get_tokens(s): + if not s: + return [] + return normalize_answer(s).split() + + +def compute_exact(a_gold, a_pred): + return int(normalize_answer(a_gold) == normalize_answer(a_pred)) + + +def compute_f1(a_gold, a_pred): + gold_toks = get_tokens(a_gold) + pred_toks = get_tokens(a_pred) + common = collections.Counter(gold_toks) & collections.Counter(pred_toks) + num_same = sum(common.values()) + if len(gold_toks) == 0 or len(pred_toks) == 0: + # If either is no-answer, then F1 is 1 if they agree, 0 otherwise + return int(gold_toks == pred_toks) + if num_same == 0: + return 0 + precision = 1.0 * num_same / len(pred_toks) + recall = 1.0 * num_same / len(gold_toks) + f1 = (2 * precision * recall) / (precision + recall) + return f1 + + +def get_raw_scores(examples, preds): + """ + Computes the exact and f1 scores from the examples and the model predictions + """ + exact_scores = {} + f1_scores = {} + + for example in examples: + qas_id = example.qas_id + gold_answers = [answer['text'] for answer in example.answers if normalize_answer(answer['text'])] + + if not gold_answers: + # For unanswerable questions, only correct answer is empty string + gold_answers = [''] + + if qas_id not in preds: + print('Missing prediction for %s' % qas_id) + continue + + prediction = preds[qas_id] + exact_scores[qas_id] = max(compute_exact(a, prediction) for a in gold_answers) + f1_scores[qas_id] = max(compute_f1(a, prediction) for a in gold_answers) + + return exact_scores, f1_scores + + +def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): + new_scores = {} + for qid, s in scores.items(): + pred_na = na_probs[qid] > na_prob_thresh + if pred_na: + new_scores[qid] = float(not qid_to_has_ans[qid]) + else: + new_scores[qid] = s + return new_scores + + +def make_eval_dict(exact_scores, f1_scores, qid_list=None): + if not qid_list: + total = len(exact_scores) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores.values()) / total), + ('f1', 100.0 * sum(f1_scores.values()) / total), + ('total', total), + ]) + else: + total = len(qid_list) + return collections.OrderedDict([ + ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), + ('f1', 100.0 * sum(f1_scores[k] 
for k in qid_list) / total), + ('total', total), + ]) + + +def merge_eval(main_eval, new_eval, prefix): + for k in new_eval: + main_eval['%s_%s' % (prefix, k)] = new_eval[k] + + +def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for _, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + return 100.0 * best_score / len(scores), best_thresh + + +def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) + + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + + +def squad_evaluate(examples, preds, no_answer_probs=None, no_answer_probability_threshold=1.0): + qas_id_to_has_answer = {example.qas_id: bool(example.answers) for example in examples} + has_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if has_answer] + no_answer_qids = [qas_id for qas_id, has_answer in qas_id_to_has_answer.items() if not has_answer] + + if no_answer_probs is None: + no_answer_probs = {k: 0.0 for k in preds} + + exact, f1 = get_raw_scores(examples, preds) + + exact_threshold = apply_no_ans_threshold(exact, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + f1_threshold = apply_no_ans_threshold(f1, no_answer_probs, qas_id_to_has_answer, no_answer_probability_threshold) + + evaluation = make_eval_dict(exact_threshold, f1_threshold) + + if has_answer_qids: + has_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=has_answer_qids) + merge_eval(evaluation, has_ans_eval, 'HasAns') + + if no_answer_qids: + no_ans_eval = make_eval_dict(exact_threshold, f1_threshold, qid_list=no_answer_qids) + merge_eval(evaluation, no_ans_eval, 'NoAns') + + if no_answer_probs: + find_all_best_thresh(evaluation, preds, exact, f1, no_answer_probs, qas_id_to_has_answer) + + return evaluation + + +def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): + """Project the tokenized prediction back to the original text.""" + + # When we created the data, we kept track of the alignment between original + # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So + # now `orig_text` contains the span of our original text corresponding to the + # span that we predicted. + # + # However, `orig_text` may contain extra characters that we don't want in + # our prediction. + # + # For example, let's say: + # pred_text = steve smith + # orig_text = Steve Smith's + # + # We don't want to return `orig_text` because it contains the extra "'s". + # + # We don't want to return `pred_text` because it's already been normalized + # (the SQuAD eval script also does punctuation stripping/lower casing but + # our tokenizer does additional normalization like stripping accent + # characters). + # + # What we really want to return is "Steve Smith". 
+ # + # Therefore, we have to apply a semi-complicated alignment heuristic between + # `pred_text` and `orig_text` to get a character-to-character alignment. This + # can fail in certain cases in which case we just return `orig_text`. + + def _strip_spaces(text): + ns_chars = [] + ns_to_s_map = collections.OrderedDict() + for (i, c) in enumerate(text): + if c == " ": + continue + ns_to_s_map[len(ns_chars)] = i + ns_chars.append(c) + ns_text = "".join(ns_chars) + return (ns_text, ns_to_s_map) + + # We first tokenize `orig_text`, strip whitespace from the result + # and `pred_text`, and check if they are the same length. If they are + # NOT the same length, the heuristic has failed. If they are the same + # length, we assume the characters are one-to-one aligned. + tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + + tok_text = " ".join(tokenizer.tokenize(orig_text)) + + start_position = tok_text.find(pred_text) + if start_position == -1: + if verbose_logging: + logger.info( + "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) + return orig_text + end_position = start_position + len(pred_text) - 1 + + (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) + (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + + if len(orig_ns_text) != len(tok_ns_text): + if verbose_logging: + logger.info("Length not equal after stripping spaces: '%s' vs '%s'", + orig_ns_text, tok_ns_text) + return orig_text + + # We then project the characters in `pred_text` back to `orig_text` using + # the character-to-character alignment. + tok_s_to_ns_map = {} + for (i, tok_index) in tok_ns_to_s_map.items(): + tok_s_to_ns_map[tok_index] = i + + orig_start_position = None + if start_position in tok_s_to_ns_map: + ns_start_position = tok_s_to_ns_map[start_position] + if ns_start_position in orig_ns_to_s_map: + orig_start_position = orig_ns_to_s_map[ns_start_position] + + if orig_start_position is None: + if verbose_logging: + logger.info("Couldn't map start position") + return orig_text + + orig_end_position = None + if end_position in tok_s_to_ns_map: + ns_end_position = tok_s_to_ns_map[end_position] + if ns_end_position in orig_ns_to_s_map: + orig_end_position = orig_ns_to_s_map[ns_end_position] + + if orig_end_position is None: + if verbose_logging: + logger.info("Couldn't map end position") + return orig_text + + output_text = orig_text[orig_start_position:(orig_end_position + 1)] + return output_text + + +def _get_best_indexes(logits, n_best_size): + """Get the n-best logits from a list.""" + index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + + best_indexes = [] + for i in range(len(index_and_score)): + if i >= n_best_size: + break + best_indexes.append(index_and_score[i][0]) + return best_indexes + + +def _compute_softmax(scores): + """Compute softmax probability over raw logits.""" + if not scores: + return [] + + max_score = None + for score in scores: + if max_score is None or score > max_score: + max_score = score + + exp_scores = [] + total_sum = 0.0 + for score in scores: + x = math.exp(score - max_score) + exp_scores.append(x) + total_sum += x + + probs = [] + for score in exp_scores: + probs.append(score / total_sum) + return probs + + def compute_predictions(all_examples, all_features, all_results, n_best_size, max_answer_length, do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, verbose_logging, @@ -204,132 +512,192 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size, return 
all_predictions -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" +def compute_predictions_extended(all_examples, all_features, all_results, n_best_size, + max_answer_length, output_prediction_file, + output_nbest_file, + output_null_log_odds_file, orig_data_file, + start_n_top, end_n_top, version_2_with_negative, + tokenizer, verbose_logging): + """ XLNet write prediction logic (more complex than Bert's). + Write final predictions to the json file and log-odds of null if needed. - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. This - # can fail in certain cases in which case we just return `orig_text`. + Requires utils_squad_evaluate.py + """ + _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name + "PrelimPrediction", + ["feature_index", "start_index", "end_index", + "start_log_prob", "end_log_prob"]) - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": + _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name + "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) + + logger.info("Writing predictions to: %s", output_prediction_file) + # logger.info("Writing nbest to: %s" % (output_nbest_file)) + + example_index_to_features = collections.defaultdict(list) + for feature in all_features: + example_index_to_features[feature.example_index].append(feature) + + unique_id_to_result = {} + for result in all_results: + unique_id_to_result[result.unique_id] = result + + all_predictions = collections.OrderedDict() + all_nbest_json = collections.OrderedDict() + scores_diff_json = collections.OrderedDict() + + for (example_index, example) in enumerate(all_examples): + features = example_index_to_features[example_index] + + prelim_predictions = [] + # keep track of the minimum score of null start+end of position 0 + score_null = 1000000 # large and positive + + for (feature_index, feature) in enumerate(features): + result = unique_id_to_result[feature.unique_id] + + cur_null_score = result.cls_logits + + # if we could have irrelevant answers, get the min score of irrelevant + score_null = min(score_null, cur_null_score) + + for i in range(start_n_top): + for j in range(end_n_top): + start_log_prob = result.start_top_log_probs[i] + start_index = result.start_top_index[i] + + j_index = i * end_n_top + j + + end_log_prob = result.end_top_log_probs[j_index] + end_index = result.end_top_index[j_index] + + # We could hypothetically create invalid predictions, e.g., 
predict + # that the start of the span is in the question. We throw out all + # invalid predictions. + if start_index >= feature.paragraph_len - 1: + continue + if end_index >= feature.paragraph_len - 1: + continue + + if not feature.token_is_max_context.get(start_index, False): + continue + if end_index < start_index: + continue + length = end_index - start_index + 1 + if length > max_answer_length: + continue + + prelim_predictions.append( + _PrelimPrediction( + feature_index=feature_index, + start_index=start_index, + end_index=end_index, + start_log_prob=start_log_prob, + end_log_prob=end_log_prob)) + + prelim_predictions = sorted( + prelim_predictions, + key=lambda x: (x.start_log_prob + x.end_log_prob), + reverse=True) + + seen_predictions = {} + nbest = [] + for pred in prelim_predictions: + if len(nbest) >= n_best_size: + break + feature = features[pred.feature_index] + + # XLNet un-tokenizer + # Let's keep it simple for now and see if we need all this later. + # + # tok_start_to_orig_index = feature.tok_start_to_orig_index + # tok_end_to_orig_index = feature.tok_end_to_orig_index + # start_orig_pos = tok_start_to_orig_index[pred.start_index] + # end_orig_pos = tok_end_to_orig_index[pred.end_index] + # paragraph_text = example.paragraph_text + # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() + + # Previously used Bert untokenizer + tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] + orig_doc_start = feature.token_to_orig_map[pred.start_index] + orig_doc_end = feature.token_to_orig_map[pred.end_index] + orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # Clean whitespace + tok_text = tok_text.strip() + tok_text = " ".join(tok_text.split()) + orig_text = " ".join(orig_tokens) + + final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, + verbose_logging) + + if final_text in seen_predictions: continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. - tokenizer = BasicTokenizer(do_lower_case=do_lower_case) + seen_predictions[final_text] = True - tok_text = " ".join(tokenizer.tokenize(orig_text)) + nbest.append( + _NbestPrediction( + text=final_text, + start_log_prob=pred.start_log_prob, + end_log_prob=pred.end_log_prob)) - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 + # In very rare edge cases we could have no valid predictions. So we + # just create a nonce prediction in this case to avoid failure. 
+ if not nbest: + nbest.append( + _NbestPrediction(text="", start_log_prob=-1e6, + end_log_prob=-1e6)) - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) + total_scores = [] + best_non_null_entry = None + for entry in nbest: + total_scores.append(entry.start_log_prob + entry.end_log_prob) + if not best_non_null_entry: + best_non_null_entry = entry - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text + probs = _compute_softmax(total_scores) - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. - tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i + nbest_json = [] + for (i, entry) in enumerate(nbest): + output = collections.OrderedDict() + output["text"] = entry.text + output["probability"] = probs[i] + output["start_log_prob"] = entry.start_log_prob + output["end_log_prob"] = entry.end_log_prob + nbest_json.append(output) - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] + assert len(nbest_json) >= 1 + assert best_non_null_entry is not None - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text + score_diff = score_null + scores_diff_json[example.qas_id] = score_diff + # note(zhiliny): always predict best_non_null_entry + # and the evaluation script will search for the best threshold + all_predictions[example.qas_id] = best_non_null_entry.text - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] + all_nbest_json[example.qas_id] = nbest_json - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text + with open(output_prediction_file, "w") as writer: + writer.write(json.dumps(all_predictions, indent=4) + "\n") - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text + with open(output_nbest_file, "w") as writer: + writer.write(json.dumps(all_nbest_json, indent=4) + "\n") + if version_2_with_negative: + with open(output_null_log_odds_file, "w") as writer: + writer.write(json.dumps(scores_diff_json, indent=4) + "\n") -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) + with open(orig_data_file, "r", encoding='utf-8') as reader: + orig_data = json.load(reader)["data"] - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes + qid_to_has_ans = make_qid_to_has_ans(orig_data) + has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] + no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] + exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) + out_eval = {} + find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - 
return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs + return out_eval diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index afbe4270f5..70dc9faf54 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -306,13 +306,13 @@ class SquadProcessor(DataProcessor): else: is_impossible = False - if not is_impossible and is_training: - if (len(qa["answers"]) != 1): - raise ValueError( - "For training, each question should have exactly 1 answer.") - answer = qa["answers"][0] - answer_text = answer['text'] - start_position_character = answer['answer_start'] + if not is_impossible: + if is_training: + answer = qa["answers"][0] + answer_text = answer['text'] + start_position_character = answer['answer_start'] + else: + answers = qa["answers"] example = SquadExample( qas_id=qas_id, @@ -321,7 +321,8 @@ class SquadProcessor(DataProcessor): answer_text=answer_text, start_position_character=start_position_character, title=title, - is_impossible=is_impossible + is_impossible=is_impossible, + answers=answers ) examples.append(example) @@ -352,6 +353,7 @@ class SquadExample(object): answer_text, start_position_character, title, + answers=None, is_impossible=False): self.qas_id = qas_id self.question_text = question_text @@ -359,6 +361,7 @@ class SquadExample(object): self.answer_text = answer_text self.title = title self.is_impossible = is_impossible + self.answers = answers self.start_position, self.end_position = 0, 0 From 9ddc3f1a1227fc9cbe4e5a5c20b21546e438dfb1 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 10:37:00 -0500 Subject: [PATCH 040/302] Naming update + XLNet/XLM evaluation --- examples/run_squad.py | 6 +- transformers/data/metrics/squad_metrics.py | 97 ++++++++++++++++++---- 2 files changed, 85 insertions(+), 18 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index b7952487dc..a9ef5c6ba2 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -17,7 +17,7 @@ from __future__ import absolute_import, division, print_function from transformers.data.processors.squad import SquadV1Processor, SquadV2Processor, SquadResult -from transformers.data.metrics.squad_metrics import compute_predictions, compute_predictions_extended, squad_evaluate +from transformers.data.metrics.squad_metrics import compute_predictions_logits, compute_predictions_log_probs, squad_evaluate import argparse import logging @@ -264,13 +264,13 @@ def evaluate(args, model, tokenizer, prefix=""): if args.model_type in ['xlnet', 'xlm']: # XLNet uses a more complex post-processing procedure - predictions = compute_predictions_extended(examples, features, all_results, args.n_best_size, + predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.predict_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: - predictions = compute_predictions(examples, features, all_results, args.n_best_size, + predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size, args.max_answer_length, 
args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 83647a20d0..1f120d354a 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -125,6 +125,53 @@ def merge_eval(main_eval, new_eval, prefix): main_eval['%s_%s' % (prefix, k)] = new_eval[k] +def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): + num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) + cur_score = num_no_ans + best_score = cur_score + best_thresh = 0.0 + qid_list = sorted(na_probs, key=lambda k: na_probs[k]) + for i, qid in enumerate(qid_list): + if qid not in scores: + continue + if qid_to_has_ans[qid]: + diff = scores[qid] + else: + if preds[qid]: + diff = -1 + else: + diff = 0 + cur_score += diff + if cur_score > best_score: + best_score = cur_score + best_thresh = na_probs[qid] + + has_ans_score, has_ans_cnt = 0, 0 + for qid in qid_list: + if not qid_to_has_ans[qid]: + continue + has_ans_cnt += 1 + + if qid not in scores: + continue + has_ans_score += scores[qid] + + return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt + + +def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): + best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2( + preds, exact_raw, na_probs, qid_to_has_ans) + best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2( + preds, f1_raw, na_probs, qid_to_has_ans) + main_eval['best_exact'] = best_exact + main_eval['best_exact_thresh'] = exact_thresh + main_eval['best_f1'] = best_f1 + main_eval['best_f1_thresh'] = f1_thresh + main_eval['has_ans_exact'] = has_ans_exact + main_eval['has_ans_f1'] = has_ans_f1 + + def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) cur_score = num_no_ans @@ -318,10 +365,20 @@ def _compute_softmax(scores): return probs -def compute_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): +def compute_predictions_logits( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + verbose_logging, + version_2_with_negative, + null_score_diff_threshold +): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) logger.info("Writing nbest to: %s" % (output_nbest_file)) @@ -450,12 +507,12 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size, text="", start_logit=null_start_logit, end_logit=null_end_logit)) - + # In very rare edge cases we could only have single null prediction. # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: + if len(nbest) == 1: nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) + _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) # In very rare edge cases we could have no valid predictions. So we # just create a nonce prediction in this case to avoid failure. 
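[Editor's note: illustrative aside, not part of this patch.] A minimal numeric sketch of the SQuAD 2.0 null-answer rule implemented by the prediction writer above: the empty string is predicted only when the null (CLS-position) score exceeds the best non-null span score by more than the configured threshold. All numbers are invented.

score_null = 4.0                  # start_logits[0] + end_logits[0] at the CLS position
best_span_score = 3.0 + 0.5       # start_logit + end_logit of the best non-null span
null_score_diff_threshold = 0.0   # corresponds to args.null_score_diff_threshold in run_squad.py
score_diff = score_null - best_span_score
prediction = "" if score_diff > null_score_diff_threshold else "best non-null span text"
# score_diff == 0.5 > 0.0, so this example question would be marked unanswerable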
@@ -512,12 +569,22 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size, return all_predictions -def compute_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, output_prediction_file, - output_nbest_file, - output_null_log_odds_file, orig_data_file, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): +def compute_predictions_log_probs( + all_examples, + all_features, + all_results, + n_best_size, + max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + orig_data_file, + start_n_top, + end_n_top, + version_2_with_negative, + tokenizer, + verbose_logging +): """ XLNet write prediction logic (more complex than Bert's). Write final predictions to the json file and log-odds of null if needed. @@ -526,7 +593,7 @@ def compute_predictions_extended(all_examples, all_features, all_results, n_best _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name "PrelimPrediction", ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) + "start_log_prob", "end_log_prob"]) _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) @@ -609,7 +676,7 @@ def compute_predictions_extended(all_examples, all_features, all_results, n_best # XLNet un-tokenizer # Let's keep it simple for now and see if we need all this later. - # + # # tok_start_to_orig_index = feature.tok_start_to_orig_index # tok_end_to_orig_index = feature.tok_end_to_orig_index # start_orig_pos = tok_start_to_orig_index[pred.start_index] From bf119c0568dfc1ea5ce0a34359e33ca002266e96 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 11:34:59 -0500 Subject: [PATCH 041/302] TFDS dataset can now be evaluated --- transformers/data/processors/squad.py | 34 ++++++++++++++++++++------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 70dc9faf54..2e50ac8a8c 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -245,22 +245,37 @@ class SquadProcessor(DataProcessor): train_file = None dev_file = None - def get_example_from_tensor_dict(self, tensor_dict): + def get_example_from_tensor_dict(self, tensor_dict, evaluate=False): + + if not evaluate: + answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') + answer_start = tensor_dict['answers']['answer_start'][0].numpy() + answers = None + else: + answers = [{ + "answer_start": start.numpy(), + "text": text.numpy().decode('utf-8') + } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])] + + answer = None + answer_start = None + return SquadExample( - tensor_dict['id'].numpy().decode("utf-8"), - tensor_dict['question'].numpy().decode('utf-8'), - tensor_dict['context'].numpy().decode('utf-8'), - tensor_dict['answers']['text'][0].numpy().decode('utf-8'), - tensor_dict['answers']['answer_start'][0].numpy(), - tensor_dict['title'].numpy().decode('utf-8') + qas_id=tensor_dict['id'].numpy().decode("utf-8"), + question_text=tensor_dict['question'].numpy().decode('utf-8'), + context_text=tensor_dict['context'].numpy().decode('utf-8'), + answer_text=answer, + start_position_character=answer_start, + title=tensor_dict['title'].numpy().decode('utf-8'), + answers=answers ) - def get_examples_from_dataset(self, dataset): + def get_examples_from_dataset(self, 
dataset, evaluate=False): """See base class.""" examples = [] for tensor_dict in tqdm(dataset): - examples.append(self.get_example_from_tensor_dict(tensor_dict)) + examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples @@ -300,6 +315,7 @@ class SquadProcessor(DataProcessor): question_text = qa["question"] start_position_character = None answer_text = None + answers = None if "is_impossible" in qa: is_impossible = qa["is_impossible"] From cca75e788485e8a2a1c44a445c6aba0fb2dfaf56 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 15:42:29 -0500 Subject: [PATCH 042/302] Kill the demon spawn --- examples/run_squad.py | 23 +++++++- transformers/data/processors/squad.py | 75 +++++---------------------- 2 files changed, 34 insertions(+), 64 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index a9ef5c6ba2..2f86322196 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -248,7 +248,28 @@ def evaluate(args, model, tokenizer, prefix=""): eval_feature = features[example_index.item()] unique_id = int(eval_feature.unique_id) - result = SquadResult([to_list(output[i]) for output in outputs] + [unique_id]) + output = [to_list(output[i]) for output in outputs] + + if len(output) >= 5: + start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3], + cls_logits = output[4] + + result = SquadResult( + unique_id, start_logits, end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits + ) + + else: + start_logits, end_logits = output + result = SquadResult( + unique_id, start_logits, end_logits + ) + all_results.append(result) evalTime = timeit.default_timer() - start_time diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 2e50ac8a8c..9306189eb4 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -446,72 +446,21 @@ class SquadFeatures(object): self.end_position = end_position - class SquadResult(object): """ Constructs a SquadResult which can be used to evaluate a model's output on the SQuAD dataset. Args: - result: The result output by a model on a SQuAD inference. These results may be complex (5 values) as the ones output by - XLNet or XLM or may be simple like the other models (2 values). They may be passed as a list or as a dict, with the - following accepted formats: - - `dict` output by a simple model: - { - "start_logits": int, - "end_logits": int, - "unique_id": string - } - `list` output by a simple model: - [start_logits, end_logits, unique_id] - - `dict` output by a complex model: - { - "start_top_log_probs": float, - "start_top_index": int, - "end_top_log_probs": float, - "end_top_index": int, - "cls_logits": int, - "unique_id": string - } - `list` output by a complex model: - [start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits, unique_id] - - See `run_squad.py` for an example. + unique_id: The unique identifier corresponding to that example. 
+ start_logits: The logits corresponding to the start of the answer + end_logits: The logits corresponding to the end of the answer """ - def __init__(self, result): - if isinstance(result, dict): - if "start_logits" in result and "end_logits" in result: - self.start_logits = result["start_logits"] - self.end_logits = result["end_logits"] - - elif "start_top_log_probs" in result and "start_top_index" in result: - self.start_top_log_probs = result["start_top_log_probs"] - self.start_top_index = result["start_top_index"] - self.end_top_log_probs = result["end_top_log_probs"] - self.end_top_index = result["end_top_index"] - self.cls_logits = result["cls_logits"] - - else: - raise ValueError("SquadResult instantiated with wrong values.") - - self.unique_id = result["unique_id"] - elif isinstance(result, list): - if len(result) == 3: - self.start_logits = result[0] - self.end_logits = result[1] - - elif len(result) == 6: - self.start_top_log_probs = result[0] - self.start_top_index = result[1] - self.end_top_log_probs = result[2] - self.end_top_index = result[3] - self.cls_logits = result[4] - - else: - raise ValueError("SquadResult instantiated with wrong values.") - - self.unique_id = result[-1] - - else: - raise ValueError("SquadResult instantiated with wrong values. Should be a dictionary or a list.") + def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): + self.start_top_log_probs = start_logits + self.end_top_log_probs = end_logits + self.unique_id = unique_id + + if start_top_index: + self.start_top_index = start_top_index + self.end_top_index = end_top_index + self.cls_logits = cls_logits \ No newline at end of file From a7ca6d738b7801c680bd25d9e910f962d3f8bf2d Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 15:43:34 -0500 Subject: [PATCH 043/302] Padding side is tokenizer-dependant --- transformers/data/processors/squad.py | 11 ++-- .../tests/tokenization_tests_commons.py | 21 +++++-- transformers/tokenization_utils.py | 60 ++++++++++++------- transformers/tokenization_xlnet.py | 1 + 4 files changed, 58 insertions(+), 35 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 9306189eb4..6599c54330 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -73,8 +73,7 @@ def _is_whitespace(c): return False def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - sequence_a_is_doc=False): + doc_stride, max_query_length, is_training): """Loads a data file into a list of `InputBatch`s.""" # Defining helper methods @@ -127,13 +126,13 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, while len(spans) * doc_stride < len(all_doc_tokens): encoded_dict = tokenizer.encode_plus( - truncated_query if not sequence_a_is_doc else span_doc_tokens, - span_doc_tokens if not sequence_a_is_doc else truncated_query, + truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, + span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, max_length=max_seq_length, return_overflowing_tokens=True, - padding_strategy='right', + pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' if not sequence_a_is_doc else 'only_first' + truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' ) 
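[Editor's note: illustrative aside, not part of this patch.] A standalone sketch of the sliding-window call made above: one padded window of at most max_length tokens comes back in input_ids, and the overflowing document tokens are returned so the loop can build the next, partially overlapping span. The model name, lengths and stride are arbitrary assumptions, and the exact return keys reflect the tokenizer API as it stood in this patch series, so treat them as assumptions too.

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
question = "Who wrote the paper?"
context = "a very long paragraph about the paper and its many authors " * 20

encoded = tokenizer.encode_plus(
    question,
    context,
    max_length=64,
    stride=16,                          # overlap carried over between consecutive windows
    return_overflowing_tokens=True,
    pad_to_max_length=True,
    truncation_strategy="only_second",  # only the document side is truncated
)
print(len(encoded["input_ids"]))           # 64: a single padded window
print(len(encoded["overflowing_tokens"]))  # document tokens left over for the next window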
paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index 40d68d0ab2..6592005c67 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -344,17 +344,19 @@ class CommonTestCases: padding_idx = tokenizer.pad_token_id # RIGHT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "right" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='right') + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert encoded_sequence + [padding_idx] * padding_size == padded_sequence # LEFT PADDING - Check that it correctly pads when a maximum length is specified along with the padding flag set to True + tokenizer.padding_side = "left" encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, padding_strategy='left') + padded_sequence = tokenizer.encode(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True) padded_sequence_length = len(padded_sequence) assert sequence_length + padding_size == padded_sequence_length assert [padding_idx] * padding_size + encoded_sequence == padded_sequence @@ -362,10 +364,15 @@ class CommonTestCases: # RIGHT & LEFT PADDING - Check that nothing is done when a maximum length is not specified encoded_sequence = tokenizer.encode(sequence) sequence_length = len(encoded_sequence) - padded_sequence_right = tokenizer.encode(sequence, padding_strategy='right') + + tokenizer.padding_side = "right" + padded_sequence_right = tokenizer.encode(sequence, pad_to_max_length=True) padded_sequence_right_length = len(padded_sequence_right) - padded_sequence_left = tokenizer.encode(sequence, padding_strategy='left') + + tokenizer.padding_side = "left" + padded_sequence_left = tokenizer.encode(sequence, pad_to_max_length=True) padded_sequence_left_length = len(padded_sequence_left) + assert sequence_length == padded_sequence_right_length assert encoded_sequence == padded_sequence_right assert sequence_length == padded_sequence_left_length @@ -387,7 +394,8 @@ class CommonTestCases: sequence_length = len(input_ids) # Test right padding - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='right', return_special_tokens_mask=True) + tokenizer.padding_side = "right" + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) padded_input_ids = padded_sequence['input_ids'] padded_token_type_ids = padded_sequence['token_type_ids'] padded_attention_mask = padded_sequence['attention_mask'] @@ -401,7 +409,8 @@ class CommonTestCases: assert special_tokens_mask + [1] * padding_size == padded_special_tokens_mask # Test left padding - padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, padding_strategy='left', 
return_special_tokens_mask=True) + tokenizer.padding_side = "left" + padded_sequence = tokenizer.encode_plus(sequence, max_length=sequence_length + padding_size, pad_to_max_length=True, return_special_tokens_mask=True) padded_input_ids = padded_sequence['input_ids'] padded_token_type_ids = padded_sequence['token_type_ids'] padded_attention_mask = padded_sequence['attention_mask'] diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index dbbabd0e1a..41a611ea49 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -77,6 +77,8 @@ class PreTrainedTokenizer(object): "pad_token", "cls_token", "mask_token", "additional_special_tokens"] + padding_side = "right" + @property def bos_token(self): """ Beginning of sentence token (string). Log an error if used while not having been set. """ @@ -223,6 +225,9 @@ class PreTrainedTokenizer(object): self.max_len = max_len if max_len is not None else int(1e12) + # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed. + self.padding_side = kwargs.pop('padding_side', self.padding_side) + # Added tokens self.added_tokens_encoder = {} self.added_tokens_decoder = {} @@ -702,7 +707,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - padding_strategy=None, + pad_to_max_length=False, return_tensors=None, **kwargs): """ @@ -729,12 +734,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's - padding index, up to their max length. If no max length is specified, no padding is done. - The strategies are handled by the following strings: + pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. + The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences - Defaults to None: no padding. + Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. **kwargs: passed to the `self.tokenize()` method @@ -745,7 +750,7 @@ class PreTrainedTokenizer(object): add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, - padding_strategy=padding_strategy, + pad_to_max_length=pad_to_max_length, return_tensors=return_tensors, **kwargs) @@ -758,7 +763,7 @@ class PreTrainedTokenizer(object): max_length=None, stride=0, truncation_strategy='longest_first', - padding_strategy=None, + pad_to_max_length=False, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -788,12 +793,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's - padding index, up to their max length. 
If no max length is specified, no padding is done. - The strategies are handled by the following strings: + pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. + The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - 'right': pads on the right of the sequences - Defaults to None: no padding. + Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). @@ -841,7 +846,7 @@ class PreTrainedTokenizer(object): return self.prepare_for_model(first_ids, pair_ids=second_ids, max_length=max_length, - padding_strategy=padding_strategy, + pad_to_max_length=pad_to_max_length, add_special_tokens=add_special_tokens, stride=stride, truncation_strategy=truncation_strategy, @@ -853,7 +858,7 @@ class PreTrainedTokenizer(object): def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, truncation_strategy='longest_first', - padding_strategy=None, + pad_to_max_length=False, return_tensors=None, return_token_type_ids=True, return_attention_mask=True, @@ -881,12 +886,12 @@ class PreTrainedTokenizer(object): - 'only_first': Only truncate the first sequence - 'only_second': Only truncate the second sequence - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) - padding_strategy: if set to a strategy, the returned sequences will be padded according to the model's - padding index, up to their max length. If no max length is specified, no padding is done. - The strategies are handled by the following strings: + pad_to_max_length: if set to True, the returned sequences will be padded according to the model's padding side and + padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. + The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences - Defaults to None: no padding. + - 'right': pads on the right of the sequences + Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). @@ -955,10 +960,19 @@ class PreTrainedTokenizer(object): "for this model ({} > {}). 
Running this sequence through the model will result in " "indexing errors".format(len(ids), self.max_len)) - if padding_strategy is not None and max_length and len(encoded_inputs["input_ids"]) < max_length: - difference = max_length - len(encoded_inputs["input_ids"]) + needs_to_be_padded = pad_to_max_length and ( + max_length and len(encoded_inputs["input_ids"]) < max_length + or + max_length is None and len(encoded_inputs["input_ids"]) < self.max_len and self.max_len <= 10000 + ) - if padding_strategy == 'right': + if pad_to_max_length and max_length is None and self.max_len > 10000: + logger.warning("Sequence can't be padded as the maximum ") + + if needs_to_be_padded: + difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) + + if self.padding_side == 'right': if return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) + [0] * difference if return_token_type_ids: @@ -967,7 +981,7 @@ class PreTrainedTokenizer(object): encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference - elif padding_strategy == 'left': + elif self.padding_side == 'left': if return_attention_mask: encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) if return_token_type_ids: @@ -977,7 +991,7 @@ class PreTrainedTokenizer(object): encoded_inputs["input_ids"] = [self.pad_token_id] * difference + encoded_inputs["input_ids"] else: - raise ValueError("Invalid padding strategy:" + str(padding_strategy)) + raise ValueError("Invalid padding strategy:" + str(self.padding_side)) elif return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) diff --git a/transformers/tokenization_xlnet.py b/transformers/tokenization_xlnet.py index 3ea71f4438..1c43c0943a 100644 --- a/transformers/tokenization_xlnet.py +++ b/transformers/tokenization_xlnet.py @@ -60,6 +60,7 @@ class XLNetTokenizer(PreTrainedTokenizer): vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + padding_side = "left" def __init__(self, vocab_file, do_lower_case=False, remove_space=True, keep_accents=False, From f7e4a7cdfa6bcf6ec7c33fd1d40d307278b1c13a Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 16:24:15 -0500 Subject: [PATCH 044/302] Cleanup --- examples/run_squad.py | 32 ++-- examples/test_examples.py | 3 +- .../{dev-v2.0-small.json => dev-v2.0.json} | 0 examples/tests_samples/SQUAD/train-v2.0.json | 140 ++++++++++++++++++ transformers/data/metrics/squad_metrics.py | 4 +- transformers/data/processors/squad.py | 36 ++++- 6 files changed, 191 insertions(+), 24 deletions(-) rename examples/tests_samples/SQUAD/{dev-v2.0-small.json => dev-v2.0.json} (100%) create mode 100644 examples/tests_samples/SQUAD/train-v2.0.json diff --git a/examples/run_squad.py b/examples/run_squad.py index 2f86322196..3f1b6a798f 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -304,8 +304,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache # Load data features from cache or dataset file - input_file = args.predict_file if evaluate else args.train_file - cached_features_file = 
os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format( + input_dir = args.data_dir if args.data_dir else "." + cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format( 'dev' if evaluate else 'train', list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length))) @@ -313,13 +313,22 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal logger.info("Loading features from cached file %s", cached_features_file) features = torch.load(cached_features_file) else: - logger.info("Creating features from dataset file at %s", input_file) + logger.info("Creating features from dataset file at %s", input_dir) - processor = SquadV2Processor() - examples = processor.get_dev_examples("examples/squad", only_first=100) if evaluate else processor.get_train_examples("examples/squad") - # import tensorflow_datasets as tfds - # tfds_examples = tfds.load("squad") - # examples = SquadV1Processor().get_examples_from_dataset(tfds_examples["validation"]) + if not args.data_dir: + try: + import tensorflow_datasets as tfds + except ImportError: + raise ImportError("If not data_dir is specified, tensorflow_datasets needs to be installed.") + + if args.version_2_with_negative: + logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") + + tfds_examples = tfds.load("squad") + examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) + else: + processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() + examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) features = squad_convert_examples_to_features( examples=examples, @@ -328,7 +337,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, - sequence_a_is_doc=True if args.model_type in ['xlnet'] else False ) @@ -365,10 +373,6 @@ def main(): parser = argparse.ArgumentParser() ## Required parameters - parser.add_argument("--train_file", default=None, type=str, required=True, - help="SQuAD json for training. E.g., train-v1.1.json") - parser.add_argument("--predict_file", default=None, type=str, required=True, - help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json") parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument("--model_name_or_path", default=None, type=str, required=True, @@ -377,6 +381,8 @@ def main(): help="The output directory where the model checkpoints and predictions will be written.") ## Other parameters + parser.add_argument("--data_dir", default=None, type=str, + help="The input data dir. Should contain the .json files for the task. 
If not specified, will run with tensorflow_datasets.") parser.add_argument("--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument("--tokenizer_name", default="", type=str, diff --git a/examples/test_examples.py b/examples/test_examples.py index b04d722b7b..632d2f728e 100644 --- a/examples/test_examples.py +++ b/examples/test_examples.py @@ -72,8 +72,7 @@ class ExamplesTests(unittest.TestCase): logger.addHandler(stream_handler) testargs = ["run_squad.py", - "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", - "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json", + "--data_dir=./examples/tests_samples/SQUAD", "--model_name=bert-base-uncased", "--output_dir=./examples/tests_samples/temp_dir", "--max_steps=10", diff --git a/examples/tests_samples/SQUAD/dev-v2.0-small.json b/examples/tests_samples/SQUAD/dev-v2.0.json similarity index 100% rename from examples/tests_samples/SQUAD/dev-v2.0-small.json rename to examples/tests_samples/SQUAD/dev-v2.0.json diff --git a/examples/tests_samples/SQUAD/train-v2.0.json b/examples/tests_samples/SQUAD/train-v2.0.json new file mode 100644 index 0000000000..834d9ee660 --- /dev/null +++ b/examples/tests_samples/SQUAD/train-v2.0.json @@ -0,0 +1,140 @@ +{ + "version": "v2.0", + "data": [{ + "title": "Normans", + "paragraphs": [{ + "qas": [{ + "question": "In what country is Normandy located?", + "id": "56ddde6b9a695914005b9628", + "answers": [{ + "text": "France", + "answer_start": 159 + }], + "is_impossible": false + }, { + "question": "When were the Normans in Normandy?", + "id": "56ddde6b9a695914005b9629", + "answers": [{ + "text": "10th and 11th centuries", + "answer_start": 94 + }], + "is_impossible": false + }, { + "question": "From which countries did the Norse originate?", + "id": "56ddde6b9a695914005b962a", + "answers": [{ + "text": "Denmark, Iceland and Norway", + "answer_start": 256 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "Rollo", + "answer_start": 308 + }], + "question": "Who did King Charles III swear fealty to?", + "id": "5ad39d53604f3c001a3fe8d3", + "answers": [], + "is_impossible": true + }, { + "plausible_answers": [{ + "text": "10th century", + "answer_start": 671 + }], + "question": "When did the Frankish identity emerge?", + "id": "5ad39d53604f3c001a3fe8d4", + "answers": [], + "is_impossible": true + }], + "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries." 
+ }, { + "qas": [{ + "question": "Who was the duke in the battle of Hastings?", + "id": "56dddf4066d3e219004dad5f", + "answers": [{ + "text": "William the Conqueror", + "answer_start": 1022 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "Antioch", + "answer_start": 1295 + }], + "question": "What principality did William the conquerer found?", + "id": "5ad3a266604f3c001a3fea2b", + "answers": [], + "is_impossible": true + }], + "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands." + }] + }, { + "title": "Computational_complexity_theory", + "paragraphs": [{ + "qas": [{ + "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?", + "id": "56e16182e3433e1400422e28", + "answers": [{ + "text": "Computational complexity theory", + "answer_start": 0 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "algorithm", + "answer_start": 472 + }], + "question": "What is a manual application of mathematical steps?", + "id": "5ad5316b5b96ef001a10ab76", + "answers": [], + "is_impossible": true + }], + "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm." 
+ }, { + "qas": [{ + "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?", + "id": "56e16839cd28a01900c67887", + "answers": [{ + "text": "if its solution requires significant resources", + "answer_start": 46 + }], + "is_impossible": false + }, { + "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?", + "id": "56e16839cd28a01900c67888", + "answers": [{ + "text": "mathematical models of computation", + "answer_start": 176 + }], + "is_impossible": false + }, { + "question": "What are two basic primary resources used to guage complexity?", + "id": "56e16839cd28a01900c67889", + "answers": [{ + "text": "time and storage", + "answer_start": 305 + }], + "is_impossible": false + }, { + "plausible_answers": [{ + "text": "the number of gates in a circuit", + "answer_start": 436 + }], + "question": "What unit is measured to determine circuit simplicity?", + "id": "5ad532575b96ef001a10ab7f", + "answers": [], + "is_impossible": true + }, { + "plausible_answers": [{ + "text": "the number of processors", + "answer_start": 502 + }], + "question": "What number is used in perpendicular computing?", + "id": "5ad532575b96ef001a10ab80", + "answers": [], + "is_impossible": true + }], + "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do." 
+ }] + }] +} \ No newline at end of file diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 1f120d354a..f8449df045 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -630,12 +630,12 @@ def compute_predictions_log_probs( for i in range(start_n_top): for j in range(end_n_top): - start_log_prob = result.start_top_log_probs[i] + start_log_prob = result.start_logits[i] start_index = result.start_top_index[i] j_index = i * end_n_top + j - end_log_prob = result.end_top_log_probs[j_index] + end_log_prob = result.end_logits[j_index] end_index = result.end_top_index[j_index] # We could hypothetically create invalid predictions, e.g., predict diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 6599c54330..dd2d9d25c0 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -146,7 +146,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, token_to_orig_map = {} for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if not sequence_a_is_doc else i + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len @@ -166,7 +166,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = j if sequence_a_is_doc else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: @@ -179,7 +179,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, p_mask = np.minimum(p_mask, 1) - if not sequence_a_is_doc: + if tokenizer.padding_side == "right": # Limit positive values to one p_mask = 1 - p_mask @@ -207,7 +207,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, end_position = cls_index span_is_impossible = True else: - if sequence_a_is_doc: + if tokenizer.padding_side == "left": doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens @@ -270,7 +270,29 @@ class SquadProcessor(DataProcessor): ) def get_examples_from_dataset(self, dataset, evaluate=False): - """See base class.""" + """ + Creates a list of :class:`~transformers.data.processors.squad.SquadExample` using a TFDS dataset. 
+ + Args: + dataset: The tfds dataset loaded from `tensorflow_datasets.load("squad")` + evaluate: boolean specifying if in evaluation mode or in training mode + + Returns: + List of SquadExample + + Examples:: + + import tensorflow_datasets as tfds + dataset = tfds.load("squad") + + training_examples = get_examples_from_dataset(dataset, evaluate=False) + evaluation_examples = get_examples_from_dataset(dataset, evaluate=True) + """ + + if evaluate: + dataset = dataset["validation"] + else: + dataset = dataset["train"] examples = [] for tensor_dict in tqdm(dataset): @@ -455,8 +477,8 @@ class SquadResult(object): end_logits: The logits corresponding to the end of the answer """ def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): - self.start_top_log_probs = start_logits - self.end_top_log_probs = end_logits + self.start_logits = start_logits + self.end_logits = end_logits self.unique_id = unique_id if start_top_index: From 33508ae310f101a2534d3e97ea23fda93e25ef38 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 16:26:45 -0500 Subject: [PATCH 045/302] Remove `only_first` --- transformers/data/processors/squad.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index dd2d9d25c0..09a79db471 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -300,29 +300,29 @@ class SquadProcessor(DataProcessor): return examples - def get_train_examples(self, data_dir, only_first=None): + def get_train_examples(self, data_dir): """See base class.""" if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "train", only_first) + return self._create_examples(input_data, "train") - def get_dev_examples(self, data_dir, only_first=None): + def get_dev_examples(self, data_dir): """See base class.""" if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] - return self._create_examples(input_data, "dev", only_first) + return self._create_examples(input_data, "dev") def get_labels(self): """See base class.""" return ["0", "1"] - def _create_examples(self, input_data, set_type, only_first=None): + def _create_examples(self, input_data, set_type): """Creates examples for the training and dev sets.""" is_training = set_type == "train" @@ -363,9 +363,6 @@ class SquadProcessor(DataProcessor): ) examples.append(example) - - if only_first is not None and len(examples) > only_first: - return examples return examples class SquadV1Processor(SquadProcessor): From 7a03519975e4f0b6698bf1221c2263ed0f8d795c Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 17:24:35 -0500 Subject: [PATCH 046/302] Documentation --- docs/source/main_classes/processors.rst | 79 +++++++++++++++++- transformers/data/processors/squad.py | 104 ++++++++++++++++++++---- 2 files changed, 164 insertions(+), 19 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index a85c126956..ce0eeb553a 100644 --- a/docs/source/main_classes/processors.rst 
+++ b/docs/source/main_classes/processors.rst @@ -55,4 +55,81 @@ Example usage ^^^^^^^^^^^^^^^^^^^^^^^^^ An example using these processors is given in the -`run_glue.py `__ script. \ No newline at end of file +`run_glue.py `__ script. + + + +SQuAD +~~~~~~~~~~~~~~~~~~~~~ + +`The Stanford Question Answering Dataset (SQuAD) `__ is a benchmark that evaluates +the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper +`SQuAD: 100,000+ Questions for Machine Comprehension of Text `__. The second version (v2.0) was released alongside +the paper `Know What You Don't Know: Unanswerable Questions for SQuAD `__. + +This library hosts a processor for each of the two versions: + +Processors +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Those processors are: + - :class:`~transformers.data.processors.utils.SquadV1Processor` + - :class:`~transformers.data.processors.utils.SquadV2Processor` + +They both inherit from the abstract class :class:`~transformers.data.processors.utils.SquadProcessor` + +.. autoclass:: transformers.data.processors.squad.SquadProcessor + :members: + +Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures` +that can be used as model inputs. + +.. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features + +These processors as well as the aforementionned method can be used with files containing the data as well as with the `tensorflow_datasets` package. +Examples are given below. + +Example usage +^^^^^^^^^^^^^^^^^^^^^^^^^ + +Here is an example using the processors as well as the conversion method using data files: + +Example:: + + # Loading a V2 processor + processor = SquadV2Processor() + examples = processor.get_dev_examples(squad_v2_data_dir) + + # Loading a V1 processor + processor = SquadV1Processor() + examples = processor.get_dev_examples(squad_v1_data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=args.doc_stride, + max_query_length=max_query_length, + is_training=not evaluate, + ) + +Using `tensorflow_datasets` is as easy as using a data file: + +Example:: + + # tensorflow_datasets only handle Squad V1. + tfds_examples = tfds.load("squad") + examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=max_seq_length, + doc_stride=args.doc_stride, + max_query_length=max_query_length, + is_training=not evaluate, + ) + + +Another example using these processors is given in the +`run_squad.py `__ script. \ No newline at end of file diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 09a79db471..b17e626c98 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -74,7 +74,35 @@ def _is_whitespace(c): def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training): - """Loads a data file into a list of `InputBatch`s.""" + """ + Converts a list of examples into a list of features that can be directly given as input to a model. + It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. 
+ + Args: + examples: list of :class:`~transformers.data.processors.squad.SquadExample` + tokenizer: an instance of a child of :class:`~transformers.PreTrainedTokenizer` + max_seq_length: The maximum sequence length of the inputs. + doc_stride: The stride used when the context is too large and is split across several features. + max_query_length: The maximum length of the query. + is_training: wheter to create features for model evaluation or model training. + + Returns: + list of :class:`~transformers.data.processors.squad.SquadFeatures` + + Example:: + + processor = SquadV2Processor() + examples = processor.get_dev_examples(data_dir) + + features = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + ) + """ # Defining helper methods unique_id = 1000000000 @@ -240,12 +268,14 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, class SquadProcessor(DataProcessor): - """Processor for the SQuAD data set.""" + """ + Processor for the SQuAD data set. + Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. + """ train_file = None dev_file = None - def get_example_from_tensor_dict(self, tensor_dict, evaluate=False): - + def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): if not evaluate: answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') answer_start = tensor_dict['answers']['answer_start'][0].numpy() @@ -296,35 +326,44 @@ class SquadProcessor(DataProcessor): examples = [] for tensor_dict in tqdm(dataset): - examples.append(self.get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples - def get_train_examples(self, data_dir): - """See base class.""" + def get_train_examples(self, data_dir, filename=None): + """ + Returns the training examples from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the training file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. + + """ if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.train_file), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") - def get_dev_examples(self, data_dir): - """See base class.""" + def get_dev_examples(self, data_dir, filename=None): + """ + Returns the evaluation example from the data directory. + + Args: + data_dir: Directory containing the data files used for training and evaluating. + filename: None by default, specify this if the evaluation file has a different name than the original one + which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. 
+ """ if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.dev_file), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.dev_file if filename is not None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") - def get_labels(self): - """See base class.""" - return ["0", "1"] - def _create_examples(self, input_data, set_type): - """Creates examples for the training and dev sets.""" - is_training = set_type == "train" examples = [] for entry in tqdm(input_data): @@ -378,6 +417,16 @@ class SquadV2Processor(SquadProcessor): class SquadExample(object): """ A single training/test example for the Squad dataset, as loaded from disk. + + Args: + qas_id: The example's unique identifier + question_text: The question string + context_text: The context string + answer_text: The answer string + start_position_character: The character position of the start of the answer + title: The title of the example + answers: None by default, this is used during evaluation. Holds answers as well as their start positions. + is_impossible: False by default, set to True if the example has no possible answer. """ def __init__(self, @@ -427,7 +476,26 @@ class SquadExample(object): class SquadFeatures(object): """ Single squad example features to be fed to a model. - Those features are model-specific. + Those features are model-specific and can be crafted from :class:`~transformers.data.processors.squad.SquadExample` + using the :method:`~transformers.data.processors.squad.squad_convert_examples_to_features` method. + + Args: + input_ids: Indices of input sequence tokens in the vocabulary. + attention_mask: Mask to avoid performing attention on padding token indices. + token_type_ids: Segment token indices to indicate first and second portions of the inputs. + cls_index: the index of the CLS token. + p_mask: Mask identifying tokens that can be answers vs. tokens that cannot. + Mask with 1 for tokens than cannot be in the answer and 0 for token that can be in an answer + example_index: the index of the example + unique_id: The unique Feature identifier + paragraph_len: The length of the context + token_is_max_context: List of booleans identifying which tokens have their maximum context in this feature object. + If a token does not have their maximum context in this feature object, it means that another feature object + has more information related to that token and should be prioritized over this feature for that token. + tokens: list of tokens corresponding to the input ids + token_to_orig_map: mapping between the tokens and the original text, needed in order to identify the answer. 
+ start_position: start of the answer token index + end_position: end of the answer token index """ def __init__(self, From ce158a076f7089bf11d44e1581f5bcab4dcc5396 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 4 Dec 2019 17:55:52 -0500 Subject: [PATCH 047/302] Return dataset (pytorch) --- transformers/data/processors/squad.py | 41 ++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index b17e626c98..338bae0c51 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -7,7 +7,11 @@ import numpy as np from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures -from ...file_utils import is_tf_available +from ...file_utils import is_tf_available, is_torch_available + +if is_torch_available: + import torch + from torch.utils.data import TensorDataset if is_tf_available(): import tensorflow as tf @@ -73,7 +77,8 @@ def _is_whitespace(c): return False def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training): + doc_stride, max_query_length, is_training, + return_dataset=False): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. @@ -84,7 +89,10 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, max_seq_length: The maximum sequence length of the inputs. doc_stride: The stride used when the context is too large and is split across several features. max_query_length: The maximum length of the query. - is_training: wheter to create features for model evaluation or model training. + is_training: whether to create features for model evaluation or model training. + return_dataset: Default False. Either 'pt' or 'tf'. 
+ if 'pt': returns a torch.data.TensorDataset, + if 'tf': returns a tf.data.Dataset Returns: list of :class:`~transformers.data.processors.squad.SquadFeatures` @@ -264,6 +272,31 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, unique_id += 1 + if return_dataset == 'pt': + if not is_torch_available(): + raise ImportError("Pytorch must be installed to return a pytorch dataset.") + + # Convert to Tensors and build dataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) + all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) + + if not is_training: + all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_example_index, all_cls_index, all_p_mask) + else: + all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) + all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) + dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, + all_start_positions, all_end_positions, + all_cls_index, all_p_mask) + + return features, dataset + + return features @@ -359,7 +392,7 @@ class SquadProcessor(DataProcessor): if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.dev_file if filename is not None else filename), "r", encoding='utf-8') as reader: + with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") From 7f998b1b832dd69cfdd8455afd5b8af3b2f77df8 Mon Sep 17 00:00:00 2001 From: Guillaume B Date: Thu, 5 Dec 2019 08:57:49 +0100 Subject: [PATCH 048/302] special_tokens_mask value was unused and calculated twice --- transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 5d683629f0..6be96989cb 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -910,7 +910,7 @@ class PreTrainedTokenizer(object): token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0)) if return_special_tokens_mask: - encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + encoded_inputs["special_tokens_mask"] = special_tokens_mask # Prepare inputs as tensors if asked if return_tensors == 'tf' and is_tf_available(): From f8fb4335c9cd79789ed6119e729348e0a1b51e2b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 5 Dec 2019 15:19:32 +0100 Subject: [PATCH 049/302] clean up a little bit PT <=> TF conversion --- transformers/convert_pytorch_checkpoint_to_tf2.py | 9 +++++---- transformers/modeling_utils.py | 3 ++- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index d1776e9c14..d20eafe2e9 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ 
b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -119,10 +119,11 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file tf_inputs = tf.constant(inputs_list) tfo = tf_model(tf_inputs, training=False) # build the network - pt_model = pt_model_class.from_pretrained(None, - config=config, - state_dict=torch.load(pytorch_checkpoint_path, - map_location='cpu')) + pt_model = pt_model_class(config) + pt_model.load_state_dict(torch.load(pytorch_checkpoint_path, map_location='cpu'), + strict-False) + pt_model.eval() + pt_inputs = torch.tensor(inputs_list) with torch.no_grad(): pto = pt_model(pt_inputs) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 398172a88c..3ac568771e 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -318,7 +318,8 @@ class PreTrainedModel(nn.Module): model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path: + if pretrained_model_name_or_path is not None and ( + "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path): logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " + "https://github.com/google-research/google-research/issues/119 for more information.") From 9ecd83dace3961eaa161405814b00ea595c86451 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 5 Dec 2019 14:44:57 -0500 Subject: [PATCH 050/302] Patch evaluation for impossible values + cleanup --- docs/source/main_classes/processors.rst | 4 ++-- examples/run_squad.py | 25 +++++-------------------- transformers/data/processors/squad.py | 6 +++--- transformers/tokenization_utils.py | 2 +- 4 files changed, 11 insertions(+), 26 deletions(-) diff --git a/docs/source/main_classes/processors.rst b/docs/source/main_classes/processors.rst index ce0eeb553a..e98910ae1b 100644 --- a/docs/source/main_classes/processors.rst +++ b/docs/source/main_classes/processors.rst @@ -55,7 +55,7 @@ Example usage ^^^^^^^^^^^^^^^^^^^^^^^^^ An example using these processors is given in the -`run_glue.py `__ script. +`run_glue.py `__ script. @@ -132,4 +132,4 @@ Example:: Another example using these processors is given in the -`run_squad.py `__ script. \ No newline at end of file +`run_squad.py `__ script. 
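As an aside on the PT <=> TF conversion cleanup a few hunks above: the converted PyTorch model is now built from its configuration first, and the checkpoint weights are loaded afterwards. A minimal sketch of that pattern, with `MyModelClass`, `config` and the checkpoint file name as purely illustrative placeholders::

    import torch

    pt_model = MyModelClass(config)                      # instantiate the architecture from a config
    state_dict = torch.load("pytorch_model.bin", map_location="cpu")
    pt_model.load_state_dict(state_dict, strict=False)   # strict=False tolerates missing/unexpected keys
    pt_model.eval()                                      # disable dropout before comparing PT and TF outputs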
\ No newline at end of file diff --git a/examples/run_squad.py b/examples/run_squad.py index 3f1b6a798f..5caff9ae4f 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -311,7 +311,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal str(args.max_seq_length))) if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) - features = torch.load(cached_features_file) + features_and_dataset = torch.load(cached_features_file) + features, dataset = features_and_dataset["features"], features_and_dataset["dataset"] else: logger.info("Creating features from dataset file at %s", input_dir) @@ -330,40 +331,24 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) - features = squad_convert_examples_to_features( + features, dataset = squad_convert_examples_to_features( examples=examples, tokenizer=tokenizer, max_seq_length=args.max_seq_length, doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, + return_dataset='pt' ) if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) - torch.save(features, cached_features_file) + torch.save({"features": features, "dataset": dataset}, cached_features_file) if args.local_rank == 0 and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) - all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) - all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) - if evaluate: - all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) - else: - all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) - if output_examples: return dataset, examples, features return dataset diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 338bae0c51..bb56aa792f 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -312,7 +312,7 @@ class SquadProcessor(DataProcessor): if not evaluate: answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') answer_start = tensor_dict['answers']['answer_start'][0].numpy() - answers = None + answers = [] else: answers = [{ "answer_start": start.numpy(), @@ -408,7 +408,7 @@ class SquadProcessor(DataProcessor): question_text = qa["question"] start_position_character = None answer_text = None - answers = None + answers = [] if "is_impossible" in qa: is_impossible = 
qa["is_impossible"] @@ -469,7 +469,7 @@ class SquadExample(object): answer_text, start_position_character, title, - answers=None, + answers=[], is_impossible=False): self.qas_id = qas_id self.question_text = question_text diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 41a611ea49..5ec173bbf6 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -194,7 +194,7 @@ class PreTrainedTokenizer(object): @property def pad_token_type_id(self): - """ Id of the padding token in the vocabulary. Log an error if used while not having been set. """ + """ Id of the padding token type in the vocabulary.""" return self._pad_token_type_id @property From e9217da5ff711cf84d150b35d3f8a5c17f1641f7 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 5 Dec 2019 16:01:51 -0500 Subject: [PATCH 051/302] Cleanup Improve global visibility on the run_squad script, remove unused files and fixes related to XLNet. --- examples/run_squad.py | 69 +- examples/utils_squad.py | 1017 -------------------- examples/utils_squad_evaluate.py | 330 ------- transformers/data/metrics/squad_metrics.py | 14 +- transformers/data/processors/squad.py | 2 +- 5 files changed, 45 insertions(+), 1387 deletions(-) delete mode 100644 examples/utils_squad.py delete mode 100644 examples/utils_squad_evaluate.py diff --git a/examples/run_squad.py b/examples/run_squad.py index 5caff9ae4f..6d32211c0c 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -27,8 +27,7 @@ import glob import timeit import numpy as np import torch -from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, - TensorDataset) +from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler, TensorDataset) from torch.utils.data.distributed import DistributedSampler try: @@ -48,14 +47,6 @@ from transformers import (WEIGHTS_NAME, BertConfig, from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features -from utils_squad import (convert_examples_to_features as old_convert, read_squad_examples as old_read, RawResult, write_predictions, - RawResultExtended, write_predictions_extended) - -# The follwing import is the official SQuAD evaluation script (2.0). 
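# Illustrative sketch, not part of the diff: the RawResult / RawResultExtended containers removed
# just above are superseded by the library's SquadResult, which covers both the plain start/end
# logits case and the XLNet/XLM top-k case. Dummy values are used purely for illustration.
from transformers.data.processors.squad import SquadResult

# "Simple" heads (BERT, RoBERTa, DistilBERT, ...): one start and one end logit per token
result = SquadResult(unique_id=1000000000,
                     start_logits=[0.1, 2.3, -1.0],
                     end_logits=[0.0, 1.7, 0.4])

# XLNet / XLM heads additionally return top-k indices and a CLS logit
xlnet_result = SquadResult(unique_id=1000000001,
                           start_logits=[-0.2, 1.1], end_logits=[0.3, 0.9],
                           start_top_index=[5, 12], end_top_index=[7, 13],
                           cls_logits=0.42)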
-# You can remove it from the dependencies if you are using this script outside of the library -# We've added it here for automated tests (see examples/test_examples.py file) -from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad - logger = logging.getLogger(__name__) ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ @@ -98,14 +89,16 @@ def train(args, train_dataset, model, tokenizer): optimizer_grouped_parameters = [ {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0} - ] + ] optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total) + if args.fp16: try: from apex import amp except ImportError: raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.") + model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level) # multi-gpu training (should be after apex fp16 initialization) @@ -133,20 +126,26 @@ def train(args, train_dataset, model, tokenizer): model.zero_grad() train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]) set_seed(args) # Added here for reproductibility (even between python 2 and 3) + for _ in train_iterator: epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) for step, batch in enumerate(epoch_iterator): model.train() batch = tuple(t.to(args.device) for t in batch) - inputs = {'input_ids': batch[0], - 'attention_mask': batch[1], - 'start_positions': batch[3], - 'end_positions': batch[4]} + + inputs = { + 'input_ids': batch[0], + 'attention_mask': batch[1], + 'start_positions': batch[3], + 'end_positions': batch[4] + } + if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] + if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[5], - 'p_mask': batch[6]}) + inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) + outputs = model(**inputs) loss = outputs[0] # model outputs are always tuple in transformers (see doc) @@ -173,8 +172,8 @@ def train(args, train_dataset, model, tokenizer): model.zero_grad() global_step += 1 + # Log metrics if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0: - # Log metrics if args.local_rank == -1 and args.evaluate_during_training: # Only evaluate when single GPU otherwise metrics may not average well results = evaluate(args, model, tokenizer) for key, value in results.items(): @@ -183,8 +182,8 @@ def train(args, train_dataset, model, tokenizer): tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step) logging_loss = tr_loss + # Save model checkpoint if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0: - # Save model checkpoint output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step)) if not os.path.exists(output_dir): os.makedirs(output_dir) @@ -213,6 +212,7 @@ def evaluate(args, model, tokenizer, prefix=""): os.makedirs(args.output_dir) args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + # Note that DistributedSampler samples randomly eval_sampler = SequentialSampler(dataset) if 
args.local_rank == -1 else DistributedSampler(dataset) eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) @@ -225,11 +225,14 @@ def evaluate(args, model, tokenizer, prefix=""): logger.info("***** Running evaluation {} *****".format(prefix)) logger.info(" Num examples = %d", len(dataset)) logger.info(" Batch size = %d", args.eval_batch_size) + all_results = [] start_time = timeit.default_timer() + for batch in tqdm(eval_dataloader, desc="Evaluating"): model.eval() batch = tuple(t.to(args.device) for t in batch) + with torch.no_grad(): inputs = { 'input_ids': batch[0], @@ -238,10 +241,13 @@ def evaluate(args, model, tokenizer, prefix=""): if args.model_type != 'distilbert': inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids + example_indices = batch[3] + + # XLNet and XLM use more arguments for their predictions if args.model_type in ['xlnet', 'xlm']: - inputs.update({'cls_index': batch[4], - 'p_mask': batch[5]}) + inputs.update({'cls_index': batch[4], 'p_mask': batch[5]}) + outputs = model(**inputs) for i, example_index in enumerate(example_indices): @@ -250,11 +256,13 @@ def evaluate(args, model, tokenizer, prefix=""): output = [to_list(output[i]) for output in outputs] + # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" + # models only use two. if len(output) >= 5: start_logits = output[0] start_top_index = output[1] end_logits = output[2] - end_top_index = output[3], + end_top_index = output[3] cls_logits = output[4] result = SquadResult( @@ -278,16 +286,17 @@ def evaluate(args, model, tokenizer, prefix=""): # Compute predictions output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix)) output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix)) + if args.version_2_with_negative: output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix)) else: output_null_log_odds_file = None + # XLNet and XLM use a more complex post-processing procedure if args.model_type in ['xlnet', 'xlm']: - # XLNet uses a more complex post-processing procedure predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, - output_nbest_file, output_null_log_odds_file, args.predict_file, + output_nbest_file, output_null_log_odds_file, model.config.start_n_top, model.config.end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: @@ -296,6 +305,7 @@ def evaluate(args, model, tokenizer, prefix=""): output_nbest_file, output_null_log_odds_file, args.verbose_logging, args.version_2_with_negative, args.null_score_diff_threshold) + # Compute the F1 and exact scores. 
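    # (Illustrative note, not from the patch.) squad_evaluate follows the official SQuAD
    # evaluation script: the returned mapping typically contains aggregate "exact", "f1" and
    # "total" entries, plus "HasAns_*" / "NoAns_*" breakdowns and best-threshold fields when
    # unanswerable questions (version 2) are involved.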
results = squad_evaluate(examples, predictions) return results @@ -308,7 +318,10 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format( 'dev' if evaluate else 'train', list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length))) + str(args.max_seq_length)) + ) + + # Init features and dataset from cache if it exists if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples: logger.info("Loading features from cached file %s", cached_features_file) features_and_dataset = torch.load(cached_features_file) @@ -341,7 +354,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal return_dataset='pt' ) - if args.local_rank in [-1, 0]: logger.info("Saving features into cached file %s", cached_features_file) torch.save({"features": features, "dataset": dataset}, cached_features_file) @@ -452,6 +464,11 @@ def main(): parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() + args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format( + list(filter(None, args.model_name_or_path.split('/'))).pop(), + str(args.max_seq_length)) + ) + if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) diff --git a/examples/utils_squad.py b/examples/utils_squad.py deleted file mode 100644 index 4f1c581588..0000000000 --- a/examples/utils_squad.py +++ /dev/null @@ -1,1017 +0,0 @@ - -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Load SQuAD dataset. """ - -from __future__ import absolute_import, division, print_function - -import json -import logging -import math -import collections -from io import open -from tqdm import tqdm - -from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize - -# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method) -from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores - -logger = logging.getLogger(__name__) - - -class SquadExample(object): - """ - A single training/test example for the Squad dataset. - For examples without an answer, the start and end position are -1. 
- """ - - def __init__(self, - qas_id, - question_text, - doc_tokens, - orig_answer_text=None, - start_position=None, - end_position=None, - is_impossible=None): - self.qas_id = qas_id - self.question_text = question_text - self.doc_tokens = doc_tokens - self.orig_answer_text = orig_answer_text - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - def __str__(self): - return self.__repr__() - - def __repr__(self): - s = "" - s += "qas_id: %s" % (self.qas_id) - s += ", question_text: %s" % ( - self.question_text) - s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens)) - if self.start_position: - s += ", start_position: %d" % (self.start_position) - if self.end_position: - s += ", end_position: %d" % (self.end_position) - if self.is_impossible: - s += ", is_impossible: %r" % (self.is_impossible) - return s - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - unique_id, - example_index, - doc_span_index, - tokens, - token_to_orig_map, - token_is_max_context, - input_ids, - input_mask, - segment_ids, - cls_index, - p_mask, - paragraph_len, - start_position=None, - end_position=None, - is_impossible=None): - self.unique_id = unique_id - self.example_index = example_index - self.doc_span_index = doc_span_index - self.tokens = tokens - self.token_to_orig_map = token_to_orig_map - self.token_is_max_context = token_is_max_context - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.cls_index = cls_index - self.p_mask = p_mask - self.paragraph_len = paragraph_len - self.start_position = start_position - self.end_position = end_position - self.is_impossible = is_impossible - - -def read_squad_examples(input_file, is_training, version_2_with_negative): - """Read a SQuAD json file into a list of SquadExample.""" - with open(input_file, "r", encoding='utf-8') as reader: - input_data = json.load(reader)["data"] - - def is_whitespace(c): - if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: - return True - return False - - examples = [] - for entry in input_data: - for paragraph in entry["paragraphs"]: - paragraph_text = paragraph["context"] - doc_tokens = [] - char_to_word_offset = [] - prev_is_whitespace = True - for c in paragraph_text: - if is_whitespace(c): - prev_is_whitespace = True - else: - if prev_is_whitespace: - doc_tokens.append(c) - else: - doc_tokens[-1] += c - prev_is_whitespace = False - char_to_word_offset.append(len(doc_tokens) - 1) - - for qa in paragraph["qas"]: - qas_id = qa["id"] - question_text = qa["question"] - start_position = None - end_position = None - orig_answer_text = None - is_impossible = False - if is_training: - if version_2_with_negative: - is_impossible = qa["is_impossible"] - if (len(qa["answers"]) != 1) and (not is_impossible): - raise ValueError( - "For training, each question should have exactly 1 answer.") - if not is_impossible: - answer = qa["answers"][0] - orig_answer_text = answer["text"] - answer_offset = answer["answer_start"] - answer_length = len(orig_answer_text) - start_position = char_to_word_offset[answer_offset] - end_position = char_to_word_offset[answer_offset + answer_length - 1] - # Only add answers where the text can be exactly recovered from the - # document. If this CAN'T happen it's likely due to weird Unicode - # stuff so we will just skip the example. - # - # Note that this means for training mode, every example is NOT - # guaranteed to be preserved. 
- actual_text = " ".join(doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join( - whitespace_tokenize(orig_answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. '%s'", - actual_text, cleaned_answer_text) - continue - else: - start_position = -1 - end_position = -1 - orig_answer_text = "" - - example = SquadExample( - qas_id=qas_id, - question_text=question_text, - doc_tokens=doc_tokens, - orig_answer_text=orig_answer_text, - start_position=start_position, - end_position=end_position, - is_impossible=is_impossible) - examples.append(example) - return examples - - -def convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - cls_token_at_end=False, - cls_token='[CLS]', sep_token='[SEP]', pad_token=0, - sequence_a_segment_id=0, sequence_b_segment_id=1, - cls_token_segment_id=0, pad_token_segment_id=0, - mask_padding_with_zero=True, - sequence_a_is_doc=False): - """Loads a data file into a list of `InputBatch`s.""" - - unique_id = 1000000000 - # cnt_pos, cnt_neg = 0, 0 - # max_N, max_M = 1024, 1024 - # f = np.zeros((max_N, max_M), dtype=np.float32) - - features = [] - for (example_index, example) in enumerate(tqdm(examples)): - - # if example_index % 100 == 0: - # logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg) - - query_tokens = tokenizer.tokenize(example.question_text) - - if len(query_tokens) > max_query_length: - query_tokens = query_tokens[0:max_query_length] - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - tok_start_position = None - tok_end_position = None - if is_training and example.is_impossible: - tok_start_position = -1 - tok_end_position = -1 - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, - example.orig_answer_text) - - # The -3 accounts for [CLS], [SEP] and [SEP] - max_tokens_for_doc = max_seq_length - len(query_tokens) - 3 - assert max_tokens_for_doc > 0 - - # We can have documents that are longer than the maximum sequence length. - # To deal with this we do a sliding window approach, where we take chunks - # of the up to our max length with a stride of `doc_stride`. 
- _DocSpan = collections.namedtuple( # pylint: disable=invalid-name - "DocSpan", ["start", "length"]) - doc_spans = [] - start_offset = 0 - while start_offset < len(all_doc_tokens): - length = len(all_doc_tokens) - start_offset - if length > max_tokens_for_doc: - length = max_tokens_for_doc - doc_spans.append(_DocSpan(start=start_offset, length=length)) - if start_offset + length == len(all_doc_tokens): - break - start_offset += min(length, doc_stride) - - for (doc_span_index, doc_span) in enumerate(doc_spans): - tokens = [] - token_to_orig_map = {} - token_is_max_context = {} - segment_ids = [] - - # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) - # Original TF implem also keep the classification token (set to 0) (not sure why...) - p_mask = [] - - # CLS token at the beginning - if not cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = 0 - - # XLNet: P SEP Q SEP CLS - # Others: CLS Q SEP P SEP - if not sequence_a_is_doc: - # Query - tokens += query_tokens - segment_ids += [sequence_a_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - # Paragraph - for i in range(doc_span.length): - split_token_index = doc_span.start + i - token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index] - - is_max_context = _check_is_max_context(doc_spans, doc_span_index, - split_token_index) - token_is_max_context[len(tokens)] = is_max_context - tokens.append(all_doc_tokens[split_token_index]) - if not sequence_a_is_doc: - segment_ids.append(sequence_b_segment_id) - else: - segment_ids.append(sequence_a_segment_id) - p_mask.append(0) - paragraph_len = doc_span.length - - if sequence_a_is_doc: - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_a_segment_id) - p_mask.append(1) - - tokens += query_tokens - segment_ids += [sequence_b_segment_id] * len(query_tokens) - p_mask += [1] * len(query_tokens) - - # SEP token - tokens.append(sep_token) - segment_ids.append(sequence_b_segment_id) - p_mask.append(1) - - # CLS token at the end - if cls_token_at_end: - tokens.append(cls_token) - segment_ids.append(cls_token_segment_id) - p_mask.append(0) - cls_index = len(tokens) - 1 # Index of classification token - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(pad_token) - input_mask.append(0 if mask_padding_with_zero else 1) - segment_ids.append(pad_token_segment_id) - p_mask.append(1) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - span_is_impossible = example.is_impossible - start_position = None - end_position = None - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. 
- doc_start = doc_span.start - doc_end = doc_span.start + doc_span.length - 1 - out_of_span = False - if not (tok_start_position >= doc_start and - tok_end_position <= doc_end): - out_of_span = True - if out_of_span: - start_position = 0 - end_position = 0 - span_is_impossible = True - else: - if sequence_a_is_doc: - doc_offset = 0 - else: - doc_offset = len(query_tokens) + 2 - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - if is_training and span_is_impossible: - start_position = cls_index - end_position = cls_index - - if example_index < 20: - logger.info("*** Example ***") - logger.info("unique_id: %s" % (unique_id)) - logger.info("example_index: %s" % (example_index)) - logger.info("doc_span_index: %s" % (doc_span_index)) - logger.info("tokens: %s" % " ".join(tokens)) - logger.info("token_to_orig_map: %s" % " ".join([ - "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()])) - logger.info("token_is_max_context: %s" % " ".join([ - "%d:%s" % (x, y) for (x, y) in token_is_max_context.items() - ])) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info( - "input_mask: %s" % " ".join([str(x) for x in input_mask])) - logger.info( - "segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - if is_training and span_is_impossible: - logger.info("impossible example") - if is_training and not span_is_impossible: - answer_text = " ".join(tokens[start_position:(end_position + 1)]) - logger.info("start_position: %d" % (start_position)) - logger.info("end_position: %d" % (end_position)) - logger.info( - "answer: %s" % (answer_text)) - - features.append( - InputFeatures( - unique_id=unique_id, - example_index=example_index, - doc_span_index=doc_span_index, - tokens=tokens, - token_to_orig_map=token_to_orig_map, - token_is_max_context=token_is_max_context, - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - cls_index=cls_index, - p_mask=p_mask, - paragraph_len=paragraph_len, - start_position=start_position, - end_position=end_position, - is_impossible=span_is_impossible)) - unique_id += 1 - - return features - - -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): - """Returns tokenized answer spans that better match the annotated answer.""" - - # The SQuAD annotations are character based. We first project them to - # whitespace-tokenized words. But then after WordPiece tokenization, we can - # often find a "better match". For example: - # - # Question: What year was John Smith born? - # Context: The leader was John Smith (1895-1943). - # Answer: 1895 - # - # The original whitespace-tokenized answer will be "(1895-1943).". However - # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match - # the exact answer, 1895. - # - # However, this is not always possible. Consider the following: - # - # Question: What country is the top exporter of electornics? - # Context: The Japanese electronics industry is the lagest in the world. - # Answer: Japan - # - # In this case, the annotator chose "Japan" as a character sub-span of - # the word "Japanese". Since our WordPiece tokenizer does not split - # "Japanese", we just use "Japanese" as the annotation. This is fairly rare - # in SQuAD, but does happen. 
- tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) - - for new_start in range(input_start, input_end + 1): - for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) - if text_span == tok_answer_text: - return (new_start, new_end) - - return (input_start, input_end) - - -def _check_is_max_context(doc_spans, cur_span_index, position): - """Check if this is the 'max context' doc span for the token.""" - - # Because of the sliding window approach taken to scoring documents, a single - # token can appear in multiple documents. E.g. - # Doc: the man went to the store and bought a gallon of milk - # Span A: the man went to the - # Span B: to the store and bought - # Span C: and bought a gallon of - # ... - # - # Now the word 'bought' will have two scores from spans B and C. We only - # want to consider the score with "maximum context", which we define as - # the *minimum* of its left and right context (the *sum* of left and - # right context will always be the same, of course). - # - # In the example the maximum context for 'bought' would be span C since - # it has 1 left context and 3 right context, while span B has 4 left context - # and 0 right context. - best_score = None - best_span_index = None - for (span_index, doc_span) in enumerate(doc_spans): - end = doc_span.start + doc_span.length - 1 - if position < doc_span.start: - continue - if position > end: - continue - num_left_context = position - doc_span.start - num_right_context = end - position - score = min(num_left_context, num_right_context) + 0.01 * doc_span.length - if best_score is None or score > best_score: - best_score = score - best_span_index = span_index - - return cur_span_index == best_span_index - - -RawResult = collections.namedtuple("RawResult", - ["unique_id", "start_logits", "end_logits"]) - -def write_predictions(all_examples, all_features, all_results, n_best_size, - max_answer_length, do_lower_case, output_prediction_file, - output_nbest_file, output_null_log_odds_file, verbose_logging, - version_2_with_negative, null_score_diff_threshold): - """Write final predictions to the json file and log-odds of null if needed.""" - logger.info("Writing predictions to: %s" % (output_prediction_file)) - logger.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", "start_logit", "end_logit"]) - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - min_null_feature_index = 0 # the paragraph slice with min null score - null_start_logit = 0 # the start logit at the slice with min null score - null_end_logit = 0 # the end logit at the slice with min null score - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - start_indexes = _get_best_indexes(result.start_logits, 
n_best_size) - end_indexes = _get_best_indexes(result.end_logits, n_best_size) - # if we could have irrelevant answers, get the min score of irrelevant - if version_2_with_negative: - feature_null_score = result.start_logits[0] + result.end_logits[0] - if feature_null_score < score_null: - score_null = feature_null_score - min_null_feature_index = feature_index - null_start_logit = result.start_logits[0] - null_end_logit = result.end_logits[0] - for start_index in start_indexes: - for end_index in end_indexes: - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= len(feature.tokens): - continue - if end_index >= len(feature.tokens): - continue - if start_index not in feature.token_to_orig_map: - continue - if end_index not in feature.token_to_orig_map: - continue - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_logit=result.start_logits[start_index], - end_logit=result.end_logits[end_index])) - if version_2_with_negative: - prelim_predictions.append( - _PrelimPrediction( - feature_index=min_null_feature_index, - start_index=0, - end_index=0, - start_logit=null_start_logit, - end_logit=null_end_logit)) - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_logit + x.end_logit), - reverse=True) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_logit", "end_logit"]) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - if pred.start_index > 0: # this is a non-null prediction - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - else: - final_text = "" - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_logit=pred.start_logit, - end_logit=pred.end_logit)) - # if we didn't include the empty option in the n-best, include it - if version_2_with_negative: - if "" not in seen_predictions: - nbest.append( - _NbestPrediction( - text="", - start_logit=null_start_logit, - end_logit=null_end_logit)) - - # In very rare edge cases we could only have single null prediction. - # So we just create a nonce prediction in this case to avoid failure. - if len(nbest)==1: - nbest.insert(0, - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. 
- if not nbest: - nbest.append( - _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0)) - - assert len(nbest) >= 1 - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_logit + entry.end_logit) - if not best_non_null_entry: - if entry.text: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_logit"] = entry.start_logit - output["end_logit"] = entry.end_logit - nbest_json.append(output) - - assert len(nbest_json) >= 1 - - if not version_2_with_negative: - all_predictions[example.qas_id] = nbest_json[0]["text"] - else: - # predict "" iff the null score - the score of best non-null > threshold - score_diff = score_null - best_non_null_entry.start_logit - ( - best_non_null_entry.end_logit) - scores_diff_json[example.qas_id] = score_diff - if score_diff > null_score_diff_threshold: - all_predictions[example.qas_id] = "" - else: - all_predictions[example.qas_id] = best_non_null_entry.text - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - return all_predictions - - -# For XLNet (and XLM which uses the same head) -RawResultExtended = collections.namedtuple("RawResultExtended", - ["unique_id", "start_top_log_probs", "start_top_index", - "end_top_log_probs", "end_top_index", "cls_logits"]) - - -def write_predictions_extended(all_examples, all_features, all_results, n_best_size, - max_answer_length, output_prediction_file, - output_nbest_file, - output_null_log_odds_file, orig_data_file, - start_n_top, end_n_top, version_2_with_negative, - tokenizer, verbose_logging): - """ XLNet write prediction logic (more complex than Bert's). - Write final predictions to the json file and log-odds of null if needed. 
- - Requires utils_squad_evaluate.py - """ - _PrelimPrediction = collections.namedtuple( # pylint: disable=invalid-name - "PrelimPrediction", - ["feature_index", "start_index", "end_index", - "start_log_prob", "end_log_prob"]) - - _NbestPrediction = collections.namedtuple( # pylint: disable=invalid-name - "NbestPrediction", ["text", "start_log_prob", "end_log_prob"]) - - logger.info("Writing predictions to: %s", output_prediction_file) - # logger.info("Writing nbest to: %s" % (output_nbest_file)) - - example_index_to_features = collections.defaultdict(list) - for feature in all_features: - example_index_to_features[feature.example_index].append(feature) - - unique_id_to_result = {} - for result in all_results: - unique_id_to_result[result.unique_id] = result - - all_predictions = collections.OrderedDict() - all_nbest_json = collections.OrderedDict() - scores_diff_json = collections.OrderedDict() - - for (example_index, example) in enumerate(all_examples): - features = example_index_to_features[example_index] - - prelim_predictions = [] - # keep track of the minimum score of null start+end of position 0 - score_null = 1000000 # large and positive - - for (feature_index, feature) in enumerate(features): - result = unique_id_to_result[feature.unique_id] - - cur_null_score = result.cls_logits - - # if we could have irrelevant answers, get the min score of irrelevant - score_null = min(score_null, cur_null_score) - - for i in range(start_n_top): - for j in range(end_n_top): - start_log_prob = result.start_top_log_probs[i] - start_index = result.start_top_index[i] - - j_index = i * end_n_top + j - - end_log_prob = result.end_top_log_probs[j_index] - end_index = result.end_top_index[j_index] - - # We could hypothetically create invalid predictions, e.g., predict - # that the start of the span is in the question. We throw out all - # invalid predictions. - if start_index >= feature.paragraph_len - 1: - continue - if end_index >= feature.paragraph_len - 1: - continue - - if not feature.token_is_max_context.get(start_index, False): - continue - if end_index < start_index: - continue - length = end_index - start_index + 1 - if length > max_answer_length: - continue - - prelim_predictions.append( - _PrelimPrediction( - feature_index=feature_index, - start_index=start_index, - end_index=end_index, - start_log_prob=start_log_prob, - end_log_prob=end_log_prob)) - - prelim_predictions = sorted( - prelim_predictions, - key=lambda x: (x.start_log_prob + x.end_log_prob), - reverse=True) - - seen_predictions = {} - nbest = [] - for pred in prelim_predictions: - if len(nbest) >= n_best_size: - break - feature = features[pred.feature_index] - - # XLNet un-tokenizer - # Let's keep it simple for now and see if we need all this later. 
- # - # tok_start_to_orig_index = feature.tok_start_to_orig_index - # tok_end_to_orig_index = feature.tok_end_to_orig_index - # start_orig_pos = tok_start_to_orig_index[pred.start_index] - # end_orig_pos = tok_end_to_orig_index[pred.end_index] - # paragraph_text = example.paragraph_text - # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip() - - # Previously used Bert untokenizer - tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)] - orig_doc_start = feature.token_to_orig_map[pred.start_index] - orig_doc_end = feature.token_to_orig_map[pred.end_index] - orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = tokenizer.convert_tokens_to_string(tok_tokens) - - # Clean whitespace - tok_text = tok_text.strip() - tok_text = " ".join(tok_text.split()) - orig_text = " ".join(orig_tokens) - - final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, - verbose_logging) - - if final_text in seen_predictions: - continue - - seen_predictions[final_text] = True - - nbest.append( - _NbestPrediction( - text=final_text, - start_log_prob=pred.start_log_prob, - end_log_prob=pred.end_log_prob)) - - # In very rare edge cases we could have no valid predictions. So we - # just create a nonce prediction in this case to avoid failure. - if not nbest: - nbest.append( - _NbestPrediction(text="", start_log_prob=-1e6, - end_log_prob=-1e6)) - - total_scores = [] - best_non_null_entry = None - for entry in nbest: - total_scores.append(entry.start_log_prob + entry.end_log_prob) - if not best_non_null_entry: - best_non_null_entry = entry - - probs = _compute_softmax(total_scores) - - nbest_json = [] - for (i, entry) in enumerate(nbest): - output = collections.OrderedDict() - output["text"] = entry.text - output["probability"] = probs[i] - output["start_log_prob"] = entry.start_log_prob - output["end_log_prob"] = entry.end_log_prob - nbest_json.append(output) - - assert len(nbest_json) >= 1 - assert best_non_null_entry is not None - - score_diff = score_null - scores_diff_json[example.qas_id] = score_diff - # note(zhiliny): always predict best_non_null_entry - # and the evaluation script will search for the best threshold - all_predictions[example.qas_id] = best_non_null_entry.text - - all_nbest_json[example.qas_id] = nbest_json - - with open(output_prediction_file, "w") as writer: - writer.write(json.dumps(all_predictions, indent=4) + "\n") - - with open(output_nbest_file, "w") as writer: - writer.write(json.dumps(all_nbest_json, indent=4) + "\n") - - if version_2_with_negative: - with open(output_null_log_odds_file, "w") as writer: - writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - - with open(orig_data_file, "r", encoding='utf-8') as reader: - orig_data = json.load(reader)["data"] - - qid_to_has_ans = make_qid_to_has_ans(orig_data) - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) - out_eval = {} - - find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) - - return out_eval - - -def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False): - """Project the tokenized prediction back to the original text.""" - - # When we created the data, we kept track of the alignment between original - # (whitespace tokenized) tokens and our WordPiece tokenized tokens. 
So - # now `orig_text` contains the span of our original text corresponding to the - # span that we predicted. - # - # However, `orig_text` may contain extra characters that we don't want in - # our prediction. - # - # For example, let's say: - # pred_text = steve smith - # orig_text = Steve Smith's - # - # We don't want to return `orig_text` because it contains the extra "'s". - # - # We don't want to return `pred_text` because it's already been normalized - # (the SQuAD eval script also does punctuation stripping/lower casing but - # our tokenizer does additional normalization like stripping accent - # characters). - # - # What we really want to return is "Steve Smith". - # - # Therefore, we have to apply a semi-complicated alignment heuristic between - # `pred_text` and `orig_text` to get a character-to-character alignment. This - # can fail in certain cases in which case we just return `orig_text`. - - def _strip_spaces(text): - ns_chars = [] - ns_to_s_map = collections.OrderedDict() - for (i, c) in enumerate(text): - if c == " ": - continue - ns_to_s_map[len(ns_chars)] = i - ns_chars.append(c) - ns_text = "".join(ns_chars) - return (ns_text, ns_to_s_map) - - # We first tokenize `orig_text`, strip whitespace from the result - # and `pred_text`, and check if they are the same length. If they are - # NOT the same length, the heuristic has failed. If they are the same - # length, we assume the characters are one-to-one aligned. - tokenizer = BasicTokenizer(do_lower_case=do_lower_case) - - tok_text = " ".join(tokenizer.tokenize(orig_text)) - - start_position = tok_text.find(pred_text) - if start_position == -1: - if verbose_logging: - logger.info( - "Unable to find text: '%s' in '%s'" % (pred_text, orig_text)) - return orig_text - end_position = start_position + len(pred_text) - 1 - - (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text) - (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text) - - if len(orig_ns_text) != len(tok_ns_text): - if verbose_logging: - logger.info("Length not equal after stripping spaces: '%s' vs '%s'", - orig_ns_text, tok_ns_text) - return orig_text - - # We then project the characters in `pred_text` back to `orig_text` using - # the character-to-character alignment. 
- tok_s_to_ns_map = {} - for (i, tok_index) in tok_ns_to_s_map.items(): - tok_s_to_ns_map[tok_index] = i - - orig_start_position = None - if start_position in tok_s_to_ns_map: - ns_start_position = tok_s_to_ns_map[start_position] - if ns_start_position in orig_ns_to_s_map: - orig_start_position = orig_ns_to_s_map[ns_start_position] - - if orig_start_position is None: - if verbose_logging: - logger.info("Couldn't map start position") - return orig_text - - orig_end_position = None - if end_position in tok_s_to_ns_map: - ns_end_position = tok_s_to_ns_map[end_position] - if ns_end_position in orig_ns_to_s_map: - orig_end_position = orig_ns_to_s_map[ns_end_position] - - if orig_end_position is None: - if verbose_logging: - logger.info("Couldn't map end position") - return orig_text - - output_text = orig_text[orig_start_position:(orig_end_position + 1)] - return output_text - - -def _get_best_indexes(logits, n_best_size): - """Get the n-best logits from a list.""" - index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True) - - best_indexes = [] - for i in range(len(index_and_score)): - if i >= n_best_size: - break - best_indexes.append(index_and_score[i][0]) - return best_indexes - - -def _compute_softmax(scores): - """Compute softmax probability over raw logits.""" - if not scores: - return [] - - max_score = None - for score in scores: - if max_score is None or score > max_score: - max_score = score - - exp_scores = [] - total_sum = 0.0 - for score in scores: - x = math.exp(score - max_score) - exp_scores.append(x) - total_sum += x - - probs = [] - for score in exp_scores: - probs.append(score / total_sum) - return probs diff --git a/examples/utils_squad_evaluate.py b/examples/utils_squad_evaluate.py deleted file mode 100644 index ed162e6fe6..0000000000 --- a/examples/utils_squad_evaluate.py +++ /dev/null @@ -1,330 +0,0 @@ -""" Official evaluation script for SQuAD version 2.0. - Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0 - -In addition to basic functionality, we also compute additional statistics and -plot precision-recall curves if an additional na_prob.json file is provided. -This file is expected to map question ID's to the model's predicted probability -that a question is unanswerable. 
-""" -import argparse -import collections -import json -import numpy as np -import os -import re -import string -import sys - -class EVAL_OPTS(): - def __init__(self, data_file, pred_file, out_file="", - na_prob_file="na_prob.json", na_prob_thresh=1.0, - out_image_dir=None, verbose=False): - self.data_file = data_file - self.pred_file = pred_file - self.out_file = out_file - self.na_prob_file = na_prob_file - self.na_prob_thresh = na_prob_thresh - self.out_image_dir = out_image_dir - self.verbose = verbose - -OPTS = None - -def parse_args(): - parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') - parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') - parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') - parser.add_argument('--out-file', '-o', metavar='eval.json', - help='Write accuracy metrics to file (default is stdout).') - parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', - help='Model estimates of probability of no answer.') - parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, - help='Predict "" if no-answer probability exceeds this (default = 1.0).') - parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, - help='Save precision-recall curves to directory.') - parser.add_argument('--verbose', '-v', action='store_true') - if len(sys.argv) == 1: - parser.print_help() - sys.exit(1) - return parser.parse_args() - -def make_qid_to_has_ans(dataset): - qid_to_has_ans = {} - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid_to_has_ans[qa['id']] = bool(qa['answers']) - return qid_to_has_ans - -def normalize_answer(s): - """Lower text and remove punctuation, articles and extra whitespace.""" - def remove_articles(text): - regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) - return re.sub(regex, ' ', text) - def white_space_fix(text): - return ' '.join(text.split()) - def remove_punc(text): - exclude = set(string.punctuation) - return ''.join(ch for ch in text if ch not in exclude) - def lower(text): - return text.lower() - return white_space_fix(remove_articles(remove_punc(lower(s)))) - -def get_tokens(s): - if not s: return [] - return normalize_answer(s).split() - -def compute_exact(a_gold, a_pred): - return int(normalize_answer(a_gold) == normalize_answer(a_pred)) - -def compute_f1(a_gold, a_pred): - gold_toks = get_tokens(a_gold) - pred_toks = get_tokens(a_pred) - common = collections.Counter(gold_toks) & collections.Counter(pred_toks) - num_same = sum(common.values()) - if len(gold_toks) == 0 or len(pred_toks) == 0: - # If either is no-answer, then F1 is 1 if they agree, 0 otherwise - return int(gold_toks == pred_toks) - if num_same == 0: - return 0 - precision = 1.0 * num_same / len(pred_toks) - recall = 1.0 * num_same / len(gold_toks) - f1 = (2 * precision * recall) / (precision + recall) - return f1 - -def get_raw_scores(dataset, preds): - exact_scores = {} - f1_scores = {} - for article in dataset: - for p in article['paragraphs']: - for qa in p['qas']: - qid = qa['id'] - gold_answers = [a['text'] for a in qa['answers'] - if normalize_answer(a['text'])] - if not gold_answers: - # For unanswerable questions, only correct answer is empty string - gold_answers = [''] - if qid not in preds: - print('Missing prediction for %s' % qid) - continue - a_pred = preds[qid] - # Take max over all gold answers - exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) - f1_scores[qid] = 
max(compute_f1(a, a_pred) for a in gold_answers) - return exact_scores, f1_scores - -def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): - new_scores = {} - for qid, s in scores.items(): - pred_na = na_probs[qid] > na_prob_thresh - if pred_na: - new_scores[qid] = float(not qid_to_has_ans[qid]) - else: - new_scores[qid] = s - return new_scores - -def make_eval_dict(exact_scores, f1_scores, qid_list=None): - if not qid_list: - total = len(exact_scores) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores.values()) / total), - ('f1', 100.0 * sum(f1_scores.values()) / total), - ('total', total), - ]) - else: - total = len(qid_list) - return collections.OrderedDict([ - ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), - ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), - ('total', total), - ]) - -def merge_eval(main_eval, new_eval, prefix): - for k in new_eval: - main_eval['%s_%s' % (prefix, k)] = new_eval[k] - -def plot_pr_curve(precisions, recalls, out_image, title): - plt.step(recalls, precisions, color='b', alpha=0.2, where='post') - plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') - plt.xlabel('Recall') - plt.ylabel('Precision') - plt.xlim([0.0, 1.05]) - plt.ylim([0.0, 1.05]) - plt.title(title) - plt.savefig(out_image) - plt.clf() - -def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, - out_image=None, title=None): - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - true_pos = 0.0 - cur_p = 1.0 - cur_r = 0.0 - precisions = [1.0] - recalls = [0.0] - avg_prec = 0.0 - for i, qid in enumerate(qid_list): - if qid_to_has_ans[qid]: - true_pos += scores[qid] - cur_p = true_pos / float(i+1) - cur_r = true_pos / float(num_true_pos) - if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: - # i.e., if we can put a threshold after this point - avg_prec += cur_p * (cur_r - recalls[-1]) - precisions.append(cur_p) - recalls.append(cur_r) - if out_image: - plot_pr_curve(precisions, recalls, out_image, title) - return {'ap': 100.0 * avg_prec} - -def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, - qid_to_has_ans, out_image_dir): - if out_image_dir and not os.path.exists(out_image_dir): - os.makedirs(out_image_dir) - num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) - if num_true_pos == 0: - return - pr_exact = make_precision_recall_eval( - exact_raw, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_exact.png'), - title='Precision-Recall curve for Exact Match score') - pr_f1 = make_precision_recall_eval( - f1_raw, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_f1.png'), - title='Precision-Recall curve for F1 score') - oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} - pr_oracle = make_precision_recall_eval( - oracle_scores, na_probs, num_true_pos, qid_to_has_ans, - out_image=os.path.join(out_image_dir, 'pr_oracle.png'), - title='Oracle Precision-Recall curve (binary task of HasAns vs. 
NoAns)') - merge_eval(main_eval, pr_exact, 'pr_exact') - merge_eval(main_eval, pr_f1, 'pr_f1') - merge_eval(main_eval, pr_oracle, 'pr_oracle') - -def histogram_na_prob(na_probs, qid_list, image_dir, name): - if not qid_list: - return - x = [na_probs[k] for k in qid_list] - weights = np.ones_like(x) / float(len(x)) - plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) - plt.xlabel('Model probability of no-answer') - plt.ylabel('Proportion of dataset') - plt.title('Histogram of no-answer probability: %s' % name) - plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) - plt.clf() - -def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): - if qid not in scores: continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - return 100.0 * best_score / len(scores), best_thresh - -def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans): - num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) - cur_score = num_no_ans - best_score = cur_score - best_thresh = 0.0 - qid_list = sorted(na_probs, key=lambda k: na_probs[k]) - for i, qid in enumerate(qid_list): - if qid not in scores: continue - if qid_to_has_ans[qid]: - diff = scores[qid] - else: - if preds[qid]: - diff = -1 - else: - diff = 0 - cur_score += diff - if cur_score > best_score: - best_score = cur_score - best_thresh = na_probs[qid] - - has_ans_score, has_ans_cnt = 0, 0 - for qid in qid_list: - if not qid_to_has_ans[qid]: continue - has_ans_cnt += 1 - - if qid not in scores: continue - has_ans_score += scores[qid] - - return 100.0 * best_score / len(scores), best_thresh, 1.0 * has_ans_score / has_ans_cnt - -def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - -def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): - best_exact, exact_thresh, has_ans_exact = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans) - best_f1, f1_thresh, has_ans_f1 = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans) - main_eval['best_exact'] = best_exact - main_eval['best_exact_thresh'] = exact_thresh - main_eval['best_f1'] = best_f1 - main_eval['best_f1_thresh'] = f1_thresh - main_eval['has_ans_exact'] = has_ans_exact - main_eval['has_ans_f1'] = has_ans_f1 - -def main(OPTS): - with open(OPTS.data_file) as f: - dataset_json = json.load(f) - dataset = dataset_json['data'] - with open(OPTS.pred_file) as f: - preds = json.load(f) - if OPTS.na_prob_file: - with open(OPTS.na_prob_file) as f: - na_probs = json.load(f) - else: - na_probs = {k: 0.0 for k in preds} - qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = get_raw_scores(dataset, preds) - 
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, - OPTS.na_prob_thresh) - f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, - OPTS.na_prob_thresh) - out_eval = make_eval_dict(exact_thresh, f1_thresh) - if has_ans_qids: - has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) - merge_eval(out_eval, has_ans_eval, 'HasAns') - if no_ans_qids: - no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) - merge_eval(out_eval, no_ans_eval, 'NoAns') - if OPTS.na_prob_file: - find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) - if OPTS.na_prob_file and OPTS.out_image_dir: - run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, - qid_to_has_ans, OPTS.out_image_dir) - histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns') - histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns') - if OPTS.out_file: - with open(OPTS.out_file, 'w') as f: - json.dump(out_eval, f) - else: - print(json.dumps(out_eval, indent=2)) - return out_eval - -if __name__ == '__main__': - OPTS = parse_args() - if OPTS.out_image_dir: - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - main(OPTS) diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index f8449df045..0755c0ab7a 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -578,7 +578,6 @@ def compute_predictions_log_probs( output_prediction_file, output_nbest_file, output_null_log_odds_file, - orig_data_file, start_n_top, end_n_top, version_2_with_negative, @@ -756,15 +755,4 @@ def compute_predictions_log_probs( with open(output_null_log_odds_file, "w") as writer: writer.write(json.dumps(scores_diff_json, indent=4) + "\n") - with open(orig_data_file, "r", encoding='utf-8') as reader: - orig_data = json.load(reader)["data"] - - qid_to_has_ans = make_qid_to_has_ans(orig_data) - has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] - no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] - exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions) - out_eval = {} - - find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans) - - return out_eval + return all_predictions diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index bb56aa792f..3d7f832540 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -9,7 +9,7 @@ from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures from ...file_utils import is_tf_available, is_torch_available -if is_torch_available: +if is_torch_available(): import torch from torch.utils.data import TensorDataset From f230d91b437c806e3e2dad37318a5ce77d208fa6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Thu, 5 Dec 2019 21:24:57 +0100 Subject: [PATCH 052/302] check the validity of links We add a script and a CI workflow to check that all download links present in the source code are valid. 
--- .circleci/config.yml | 11 ++++++ utils/link_tester.py | 79 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) create mode 100644 utils/link_tester.py diff --git a/.circleci/config.yml b/.circleci/config.yml index 01e6d82b33..ebfbd79b93 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -82,6 +82,16 @@ jobs: - run: sudo pip install --progress-bar off -r docs/requirements.txt - run: sudo pip install --progress-bar off -r requirements.txt - run: ./.circleci/deploy.sh + repository_consistency: + working_directory: ~/transformers + docker: + - image: circleci/python:3.5 + resource_class: small + parallelism: 1 + steps: + - checkout + - run: sudo pip install requests + - run: python ./utils/link_tester.py workflow_filters: &workflow_filters filters: branches: @@ -91,6 +101,7 @@ workflows: version: 2 build_and_test: jobs: + - repository_consistency - build_py3_torch_and_tf - build_py3_torch - build_py3_tf diff --git a/utils/link_tester.py b/utils/link_tester.py new file mode 100644 index 0000000000..fe3990d28c --- /dev/null +++ b/utils/link_tester.py @@ -0,0 +1,79 @@ +""" Link tester. + +This little utility reads all the python files in the repository, +scans for links pointing to S3 and tests the links one by one. Raises an error +at the end of the scan if at least one link was reported broken. +""" +import os +import re +import sys + +import requests + + +REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1""" + + +def list_python_files_in_repository(): + """ List all python files in the repository. + + This function assumes that the script is executed in the root folder. + """ + source_code_files = [] + for path, subdirs, files in os.walk("."): + if "templates" in path: + continue + for name in files: + if ".py" in name and ".pyc" not in name: + path_to_files = os.path.join(path, name) + source_code_files.append(path_to_files) + + return source_code_files + + +def find_all_links(file_paths): + links = [] + for path in file_paths: + links += scan_code_for_links(path) + + return links + + +def scan_code_for_links(source): + """ Scans the file to find links using a regular expression. + Returns a list of links. + """ + with open(source, 'r') as content: + content = content.read() + raw_links = re.findall(REGEXP_FIND_S3_LINKS, content) + links = [prefix + suffix for _, prefix, suffix in raw_links] + + return links + + +def check_all_links(links): + """ Check that the provided links are valid. + + Links are considered valid if a HEAD request to the server + returns a 200 status code. 
+ """ + broken_links = [] + for link in links: + head = requests.head(link) + if head.status_code != 200: + broken_links.append(link) + + return broken_links + + +if __name__ == "__main__": + file_paths = list_python_files_in_repository() + links = find_all_links(file_paths) + broken_links = check_all_links(links) + print("Looking for broken links to pre-trained models/configs/tokenizers...") + if broken_links: + print("The following links did not respond:") + for link in broken_links: + print("- {}".format(link)) + sys.exit(1) + print("All links are ok.") From 21451ec6ba364de78c14e7d05a55913da2809844 Mon Sep 17 00:00:00 2001 From: Philipp Glock Date: Fri, 6 Dec 2019 10:32:43 +0100 Subject: [PATCH 053/302] handle string with only whitespaces as empty --- transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 5d683629f0..bc246cc8fe 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -634,7 +634,7 @@ class PreTrainedTokenizer(object): return result def split_on_tokens(tok_list, text): - if not text: + if not text.strip(): return [] if not tok_list: return self._tokenize(text, **kwargs) From 1d87b37d100c69ff3b2c1a5dfd271b6cf777176e Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 6 Dec 2019 15:30:09 +0100 Subject: [PATCH 054/302] updating --- .../convert_pytorch_checkpoint_to_tf2.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index d20eafe2e9..2c419888e8 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -119,10 +119,10 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file tf_inputs = tf.constant(inputs_list) tfo = tf_model(tf_inputs, training=False) # build the network - pt_model = pt_model_class(config) - pt_model.load_state_dict(torch.load(pytorch_checkpoint_path, map_location='cpu'), - strict-False) - pt_model.eval() + state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu') + pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None, + config=config, + state_dict=state_dict) pt_inputs = torch.tensor(inputs_list) with torch.no_grad(): @@ -140,7 +140,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None, - compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False): + compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False): assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory" if args_model_type is None: @@ -188,13 +188,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc if os.path.isfile(model_shortcut_name): model_shortcut_name = 'converted_model' + convert_pt_checkpoint_to_tf(model_type=model_type, pytorch_checkpoint_path=model_file, config_file=config_file, tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'), compare_with_pt_model=compare_with_pt_model) - os.remove(config_file) - os.remove(model_file) + if remove_cached_files: + os.remove(config_file) + os.remove(model_file) if __name__ == "__main__": @@ 
-227,6 +229,9 @@ if __name__ == "__main__": parser.add_argument("--use_cached_models", action='store_true', help = "Use cached models if possible instead of updating to latest checkpoint versions.") + parser.add_argument("--remove_cached_files", + action='store_true', + help = "Remove pytorch models after conversion (save memory when converting in batches).") parser.add_argument("--only_convert_finetuned_models", action='store_true', help = "Only convert finetuned models.") @@ -246,4 +251,5 @@ if __name__ == "__main__": config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None, compare_with_pt_model=args.compare_with_pt_model, use_cached_models=args.use_cached_models, + remove_cached_files=args.remove_cached_files, only_convert_finetuned_models=args.only_convert_finetuned_models) From 3520be7824ad11ebc05a393fd90ecfdd4203cfdb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Mon, 9 Dec 2019 11:13:09 +0100 Subject: [PATCH 055/302] create encoder attention mask from shape of hidden states We currently create encoder attention masks (when they're not provided) based on the shape of the inputs to the encoder. This is obviously wrong; sequences can be of different lengths. We now create the encoder attention mask based on the batch_size and sequence_length of the encoder hidden states. --- transformers/modeling_bert.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 1ee3e3f097..8295cf4664 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -691,17 +691,19 @@ class BertModel(BertPreTrainedModel): # If a 2D ou 3D attention mask is provided for the cross-attention # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length] - if self.config.is_decoder: + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) if encoder_attention_mask is None: - encoder_attention_mask = torch.ones(input_shape, device=device) + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) if encoder_attention_mask.dim() == 3: encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] elif encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] else: - raise ValueError("Wrong shape for input_ids (shape {}) or encoder_attention_mask (shape {})".format(input_shape, - encoder_attention_mask.shape)) + raise ValueError("Wrong shape for encoder_hidden_shape (shape {}) or encoder_attention_mask (shape {})".format(encoder_hidden_shape, + encoder_attention_mask.shape)) encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 From 169fea6855741315e2e0e15881cefc9823803aa6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 9 Dec 2019 16:25:33 +0100 Subject: [PATCH 056/302] updating T5 --- transformers/modeling_t5.py | 31 +++++++++++++------------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 1bf55611a2..104e9060fc 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -281,7 +281,7 @@ class T5Attention(nn.Module): context_position = 
torch.arange(qlen, dtype=torch.long)[:, None] memory_position = torch.arange(klen, dtype=torch.long)[None, :] relative_position = memory_position - context_position # shape (qlen, klen) - rp_bucket = self._relative_position_bucket(relative_position, + rp_bucket = self._relative_position_bucket(relative_position, # shape (qlen, klen) bidirectional=not self.is_decoder, num_buckets=self.relative_attention_num_buckets) values = self.relative_attention_bias(rp_bucket) # shape (qlen, klen, num_heads) @@ -337,14 +337,10 @@ class T5Attention(nn.Module): if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(qlen, klen) + if mask is not None: + position_bias += mask # (bs, n_heads, qlen, klen) + scores += position_bias - special_out = position_bias - - if mask is not None: - scores += mask - # mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) - # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) - weights = F.softmax(scores.float(), dim=-1).type_as(scores) # (bs, n_heads, qlen, klen) weights = F.dropout(weights, p=self.dropout, training=self.training) # (bs, n_heads, qlen, klen) @@ -362,7 +358,7 @@ class T5Attention(nn.Module): outputs = outputs + (weights,) if self.has_relative_attention_bias: outputs = outputs + (position_bias,) - return outputs + (special_out,) + return outputs class T5LayerSelfAttention(nn.Module): @@ -379,11 +375,9 @@ class T5LayerSelfAttention(nn.Module): position_bias=position_bias, head_mask=head_mask) y = attention_output[0] - special_out = attention_output[-1] - attention_output = attention_output[:-1] layer_output = hidden_states + self.dropout(y) outputs = (layer_output,) + attention_output[1:] # add attentions if we output them - return outputs + (special_out,) + return outputs class T5LayerCrossAttention(nn.Module): @@ -426,8 +420,7 @@ class T5Block(nn.Module): position_bias=position_bias, head_mask=head_mask) hidden_states = self_attention_outputs[0] - special_out = self_attention_outputs[-1] - outputs = self_attention_outputs[1:-1] # Keep self-attention outputs and relative position weights + outputs = self_attention_outputs[1:] # Keep self-attention outputs and relative position weights if not self.is_decoder: hidden_states = self.layer[1](hidden_states) @@ -442,7 +435,7 @@ class T5Block(nn.Module): hidden_states = self.layer[2](hidden_states) outputs = (hidden_states,) + outputs # add attentions if we output them - return outputs + (special_out,) # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) + return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) class T5PreTrainedModel(PreTrainedModel): @@ -536,6 +529,10 @@ class T5Stack(T5PreTrainedModel): # positions we want to attend and -1e9 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. + + # T5 has a mask that can compare sequence ids, we simulate this here with this transposistion + # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2)) extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -1e9 @@ -584,8 +581,6 @@ class T5Stack(T5PreTrainedModel): encoder_attention_mask=encoder_extended_attention_mask, encoder_decoder_position_bias=encoder_decoder_position_bias, head_mask=head_mask[i]) - if i == 0: - special_out = layer_outputs[-1] # layer_outputs is a tuple with: # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) hidden_states = layer_outputs[0] @@ -610,7 +605,7 @@ class T5Stack(T5PreTrainedModel): outputs = outputs + (all_hidden_states,) if self.output_attentions: outputs = outputs + (all_attentions,) - return outputs + (special_out,) # last-layer hidden state, (all hidden states), (all attentions) + return outputs # last-layer hidden state, (all hidden states), (all attentions) T5_START_DOCSTRING = r""" The T5 model was proposed in From 2a4ef098d65939d436e2a5efbb518fb807b6b1b6 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Mon, 9 Dec 2019 10:46:47 -0500 Subject: [PATCH 057/302] Add ALBERT and XLM to SQuAD script --- examples/run_squad.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index a8ac1d1b05..2df29014ef 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -44,7 +44,9 @@ from transformers import (WEIGHTS_NAME, BertConfig, XLNetForQuestionAnswering, XLNetTokenizer, DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer, - AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer) + AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer, + XLMConfig, XLMForQuestionAnswering, XLMTokenizer, + ) from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_examples_to_features @@ -58,7 +60,8 @@ MODEL_CLASSES = { 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer) + 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), + 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer) } def set_seed(args): From b016dd16c90c2c18168d13bca6d5002729fd5b0a Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 9 Dec 2019 21:38:07 +0100 Subject: [PATCH 058/302] fix tests on python 3.5 --- transformers/modeling_t5.py | 2 +- transformers/tests/modeling_common_test.py | 15 ++++++++------- transformers/tokenization_t5.py | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 104e9060fc..e48293b49e 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -338,7 +338,7 @@ class T5Attention(nn.Module): raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(qlen, klen) if mask is not None: - position_bias += mask # (bs, n_heads, qlen, klen) + position_bias = position_bias + mask # (bs, n_heads, qlen, klen) scores += position_bias weights = F.softmax(scores.float(), dim=-1).type_as(scores) 
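[Editorial aside, not part of any patch] The sequence-id comparison trick added to T5Stack above compares the broadcast attention mask with its own transpose. A minimal, self-contained sketch of that idea, assuming the usual (batch, 1, 1, seq_len) shape of the extended mask at that point:

import torch

# 1 marks real tokens, 0 marks padding; shapes follow the comments in the patch.
attention_mask = torch.tensor([[1, 1, 1, 0]])
extended = attention_mask[:, None, None, :].float()            # (bs, 1, 1, seq_len)
# Comparing the mask with its transpose keeps position pairs that share the same
# mask value, emulating mesh-tensorflow's sequence-id comparison referenced above.
pairwise = (extended == extended.transpose(-1, -2)).float()    # (bs, 1, seq_len, seq_len)
additive_mask = (1.0 - pairwise) * -1e9                        # added to attention scores
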
# (bs, n_heads, qlen, klen) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index ee75da605c..11aeaafe31 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -138,8 +138,8 @@ class CommonTestCases: self.assertListEqual( list(attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length, + self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length]) out_len = len(outputs) if self.is_encoder_decoder: @@ -151,8 +151,8 @@ class CommonTestCases: self.assertListEqual( list(decoder_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length, + self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length]) # Check attention is always last and order is fine config.output_attentions = True @@ -169,8 +169,8 @@ class CommonTestCases: self.assertListEqual( list(self_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length, + self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length]) def test_torchscript(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() @@ -440,7 +440,8 @@ class CommonTestCases: self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1) self.assertListEqual( list(hidden_states[0].shape[-2:]), - [self.model_tester.seq_length, self.model_tester.hidden_size]) + [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length, + self.model_tester.hidden_size]) def test_resize_tokens_embeddings(self): original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 3847aeefbf..933084d13a 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -134,7 +134,7 @@ class T5Tokenizer(PreTrainedTokenizer): """ Converts a token (str/unicode) in an id using the vocab. 
""" if token.startswith(u"', token) - num = int(l[1]) + num = int(l.group(1)) return self.vocab_size - num - 1 return self.sp_model.piece_to_id(token) From 808bb8da7edbd9f5858b3c223ebac9bd83275934 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 9 Dec 2019 21:48:34 +0100 Subject: [PATCH 059/302] fix transfo xl tests --- transformers/tests/modeling_common_test.py | 18 ++++++++++++------ .../tests/modeling_tf_transfo_xl_test.py | 2 +- transformers/tests/modeling_transfo_xl_test.py | 2 +- 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 11aeaafe31..7033a06d0b 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -125,6 +125,11 @@ class CommonTestCases: def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length + encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length + decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length + encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length + for model_class in self.all_model_classes: config.output_attentions = True config.output_hidden_states = False @@ -138,8 +143,8 @@ class CommonTestCases: self.assertListEqual( list(attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length, - self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length]) + encoder_seq_length , + encoder_key_length]) out_len = len(outputs) if self.is_encoder_decoder: @@ -151,8 +156,9 @@ class CommonTestCases: self.assertListEqual( list(decoder_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length, - self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length]) + decoder_seq_length, + decoder_key_length + ]) # Check attention is always last and order is fine config.output_attentions = True @@ -169,8 +175,8 @@ class CommonTestCases: self.assertListEqual( list(self_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length, - self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length]) + encoder_seq_length, + encoder_key_length]) def test_torchscript(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py index 534fe39646..8ebd749b4c 100644 --- a/transformers/tests/modeling_tf_transfo_xl_test.py +++ b/transformers/tests/modeling_tf_transfo_xl_test.py @@ -68,7 +68,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): self.batch_size = batch_size self.seq_length = seq_length 
self.mem_len = mem_len - self.key_len = seq_length + mem_len + self.key_length = seq_length + mem_len self.clamp_len = clamp_len self.is_training = is_training self.use_labels = use_labels diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py index f7b913da5b..2d1541d87b 100644 --- a/transformers/tests/modeling_transfo_xl_test.py +++ b/transformers/tests/modeling_transfo_xl_test.py @@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): self.batch_size = batch_size self.seq_length = seq_length self.mem_len = mem_len - self.key_len = seq_length + mem_len + self.key_length = seq_length + mem_len self.clamp_len = clamp_len self.is_training = is_training self.use_labels = use_labels From 8e651f56b75982f07fc522b62f298d8d70e6e56f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 9 Dec 2019 22:13:57 +0100 Subject: [PATCH 060/302] fix tf tests --- transformers/tests/modeling_tf_common_test.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index 20ccfd8ce0..26bd037c9e 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -213,6 +213,11 @@ class TFCommonTestCases: def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length + encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length + decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length + encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length + for model_class in self.all_model_classes: config.output_attentions = True config.output_hidden_states = False @@ -225,8 +230,8 @@ class TFCommonTestCases: self.assertListEqual( list(attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + encoder_seq_length, + encoder_key_length]) out_len = len(outputs) if self.is_encoder_decoder: @@ -238,8 +243,8 @@ class TFCommonTestCases: self.assertListEqual( list(decoder_attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + decoder_seq_length, + decoder_key_length]) # Check attention is always last and order is fine config.output_attentions = True @@ -255,8 +260,8 @@ class TFCommonTestCases: self.assertListEqual( list(attentions[0].shape[-3:]), [self.model_tester.num_attention_heads, - self.model_tester.seq_length, - self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length]) + encoder_seq_length, + encoder_key_length]) def test_headmasking(self): pass From 608a8f5b567f81f3cc997a195496dd8bf1c28158 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 10 Dec 2019 10:01:01 +0100 Subject: [PATCH 061/302] updating tf 2.0 layer_norm to T5 layer norm --- transformers/modeling_tf_t5.py | 43 ++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 13 
deletions(-) diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index c1de4745c2..11762ee1e5 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -17,16 +17,11 @@ from __future__ import absolute_import, division, print_function, unicode_literals -import json import logging import math -import os -import sys import copy import itertools -from io import open -import numpy as np import tensorflow as tf from .configuration_t5 import T5Config @@ -45,6 +40,28 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model) #################################################### +class TFT5LayerNorm(tf.keras.layers.Layer): + def __init__(self, epsilon=1e-6, **kwargs): + """ Construct a layernorm module in the T5 style + No bias and no substraction of mean. + """ + super(TFT5LayerNorm, self).__init__(**kwargs) + self.variance_epsilon = epsilon + + def build(self, input_shape): + """Build shared word embedding layer """ + self.weight = self.add_weight( + "weight", + shape=(input_shape[-1],), + initializer='ones') + super(TFT5LayerNorm, self).build(input_shape) + + def call(self, x): + variance = tf.math.reduce_min(tf.math.square(x), axis=-1, keepdims=True) + x = x * tf.math.rsqrt(variance + self.variance_epsilon) + return self.weight * x + + class TFT5DenseReluDense(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFT5DenseReluDense, self).__init__(**kwargs) @@ -65,8 +82,8 @@ class TFT5LayerFF(tf.keras.layers.Layer): def __init__(self, config, **kwargs): super(TFT5LayerFF, self).__init__(**kwargs) self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense') - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, + name='layer_norm') self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, training=False): @@ -249,8 +266,8 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer): self.SelfAttention = TFT5Attention(config, has_relative_attention_bias=has_relative_attention_bias, name='SelfAttention') - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, + name='layer_norm') self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, attention_mask=None, position_bias=None, @@ -273,8 +290,8 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer): self.EncDecAttention = TFT5Attention(config, has_relative_attention_bias=has_relative_attention_bias, name='EncDecAttention') - self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, - name='layer_norm') + self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, + name='layer_norm') self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def call(self, hidden_states, kv, attention_mask=None, position_bias=None, @@ -353,8 +370,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): has_relative_attention_bias=bool(i == 0), name='block_._{}'.format(i)) for i in range(config.num_layers)] - self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, - name='final_layer_norm') + self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon, + name='final_layer_norm') self.dropout = tf.keras.layers.Dropout(config.dropout_rate) def 
_resize_token_embeddings(self, new_num_tokens): From 72c36b9ea2e43d017d3aa5520d09f55d8ec19025 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 16 Oct 2019 14:17:58 +0200 Subject: [PATCH 062/302] [WIP] - CLI --- setup.py | 8 +- transformers-cli | 5 +- transformers/__init__.py | 2 + transformers/__main__.py | 145 ++++-------------- transformers/commands/convert.py | 115 ++++++++++++++ transformers/commands/serving.py | 176 ++++++++++++++++++++++ transformers/commands/train.py | 121 +++++++++++++++ transformers/data/__init__.py | 2 +- transformers/data/processors/__init__.py | 2 +- transformers/data/processors/utils.py | 182 +++++++++++++++++++++++ 10 files changed, 631 insertions(+), 127 deletions(-) create mode 100644 transformers/commands/convert.py create mode 100644 transformers/commands/serving.py create mode 100644 transformers/commands/train.py diff --git a/setup.py b/setup.py index c4af32df83..0b7e512955 100644 --- a/setup.py +++ b/setup.py @@ -62,15 +62,15 @@ setup( 'regex', 'sentencepiece', 'sacremoses'], + extras_require=extras, + scripts=[ + 'transformers-cli' + ], entry_points={ 'console_scripts': [ "transformers=transformers.__main__:main", ] }, - extras_require=extras, - scripts=[ - 'transformers-cli' - ], # python_requires='>=3.5.0', classifiers=[ 'Intended Audience :: Science/Research', diff --git a/transformers-cli b/transformers-cli index ef00d15aa3..7b0905d4b4 100644 --- a/transformers-cli +++ b/transformers-cli @@ -1,14 +1,15 @@ #!/usr/bin/env python from argparse import ArgumentParser +from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands - if __name__ == '__main__': - parser = ArgumentParser(description='Transformers CLI tool', usage='transformers-cli []') + parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli []') commands_parser = parser.add_subparsers(help='transformers-cli command helpers') # Register commands + ServeCommand.register_subcommand(commands_parser) UserCommands.register_subcommand(commands_parser) # Let's go diff --git a/transformers/__init__.py b/transformers/__init__.py index f9a28add5f..a71a291a44 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -24,6 +24,8 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH from .data import (is_sklearn_available, InputExample, InputFeatures, DataProcessor, + SingleSentenceClassificationProcessor, + convert_examples_to_features, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, xnli_output_modes, xnli_processors, xnli_tasks_num_labels, diff --git a/transformers/__main__.py b/transformers/__main__.py index 31dbd24908..a6e9ae65e0 100644 --- a/transformers/__main__.py +++ b/transformers/__main__.py @@ -1,129 +1,36 @@ # coding: utf8 + def main(): import sys - if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]: + if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]: print( - "This command line utility let you convert original (author released) model checkpoint to pytorch.\n" - "It should be used as one of: \n" - ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n" - ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n" - ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n" - ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n" 
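[Editorial aside, not part of any patch] PATCH 061 above replaces Keras `LayerNormalization` with a T5-style norm (scale only, no bias, no mean subtraction). A minimal PyTorch sketch of that formulation, assuming the usual root-mean-square variant:

import torch

def t5_layer_norm(x, weight, eps=1e-6):
    # Normalize by the root of the mean of squares; no mean subtraction, no bias.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    return weight * x * torch.rsqrt(variance + eps)

x = torch.randn(2, 5, 8)
weight = torch.ones(8)
print(t5_layer_norm(x, weight).shape)   # torch.Size([2, 5, 8])
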
- ">> transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n" - ">> transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT") - else: - if sys.argv[1] == "bert": - try: - from .convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch - except ImportError: - print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " - "In that case, it requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") - raise + "First argument to `transformers` command line interface should be one of: \n" + ">> convert serve train predict") + if sys.argv[1] == "convert": + from transformers.commands import convert + convert(sys.argv) + elif sys.argv[1] == "train": + from transformers.commands import train + train(sys.argv) + elif sys.argv[1] == "serve": + pass + # from argparse import ArgumentParser + # from transformers.commands.serving import ServeCommand + # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve []') + # commands_parser = parser.add_subparsers(help='transformers-cli command helpers') - if len(sys.argv) != 5: - # pylint: disable=line-too-long - print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`") - else: - PYTORCH_DUMP_OUTPUT = sys.argv.pop() - TF_CONFIG = sys.argv.pop() - TF_CHECKPOINT = sys.argv.pop() - convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) - elif sys.argv[1] == "gpt": - from .convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch - if len(sys.argv) < 4 or len(sys.argv) > 5: - # pylint: disable=line-too-long - print("Should be used as `transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`") - else: - OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2] - PYTORCH_DUMP_OUTPUT = sys.argv[3] - if len(sys.argv) == 5: - OPENAI_GPT_CONFIG = sys.argv[4] - else: - OPENAI_GPT_CONFIG = "" - convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH, - OPENAI_GPT_CONFIG, - PYTORCH_DUMP_OUTPUT) - elif sys.argv[1] == "transfo_xl": - try: - from .convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch - except ImportError: - print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " - "In that case, it requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") - raise - if len(sys.argv) < 4 or len(sys.argv) > 5: - # pylint: disable=line-too-long - print("Should be used as `transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") - else: - if 'ckpt' in sys.argv[2].lower(): - TF_CHECKPOINT = sys.argv[2] - TF_DATASET_FILE = "" - else: - TF_DATASET_FILE = sys.argv[2] - TF_CHECKPOINT = "" - PYTORCH_DUMP_OUTPUT = sys.argv[3] - if len(sys.argv) == 5: - TF_CONFIG = sys.argv[4] - else: - TF_CONFIG = "" - convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE) - elif sys.argv[1] == "gpt2": - try: - from .convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch - except ImportError: - print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " - "In that case, it requires TensorFlow to be installed. 
Please see " - "https://www.tensorflow.org/install/ for installation instructions.") - raise + # # Register commands + # ServeCommand.register_subcommand(commands_parser) - if len(sys.argv) < 4 or len(sys.argv) > 5: - # pylint: disable=line-too-long - print("Should be used as `transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`") - else: - TF_CHECKPOINT = sys.argv[2] - PYTORCH_DUMP_OUTPUT = sys.argv[3] - if len(sys.argv) == 5: - TF_CONFIG = sys.argv[4] - else: - TF_CONFIG = "" - convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT) - elif sys.argv[1] == "xlnet": - try: - from .convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch - except ImportError: - print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " - "In that case, it requires TensorFlow to be installed. Please see " - "https://www.tensorflow.org/install/ for installation instructions.") - raise + # # Let's go + # args = parser.parse_args() - if len(sys.argv) < 5 or len(sys.argv) > 6: - # pylint: disable=line-too-long - print("Should be used as `transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`") - else: - TF_CHECKPOINT = sys.argv[2] - TF_CONFIG = sys.argv[3] - PYTORCH_DUMP_OUTPUT = sys.argv[4] - if len(sys.argv) == 6: - FINETUNING_TASK = sys.argv[5] - else: - FINETUNING_TASK = None - - convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT, - TF_CONFIG, - PYTORCH_DUMP_OUTPUT, - FINETUNING_TASK) - elif sys.argv[1] == "xlm": - from .convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch - - if len(sys.argv) != 4: - # pylint: disable=line-too-long - print("Should be used as `transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`") - else: - XLM_CHECKPOINT_PATH = sys.argv[2] - PYTORCH_DUMP_OUTPUT = sys.argv[3] - - convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT) + # if not hasattr(args, 'func'): + # parser.print_help() + # exit(1) + # # Run + # service = args.func(args) + # service.run() if __name__ == '__main__': main() diff --git a/transformers/commands/convert.py b/transformers/commands/convert.py new file mode 100644 index 0000000000..55dbf53734 --- /dev/null +++ b/transformers/commands/convert.py @@ -0,0 +1,115 @@ +from argparse import ArgumentParser, Namespace + +from logging import getLogger + +from transformers import AutoModel, AutoTokenizer +from transformers.commands import BaseTransformersCLICommand + + +def convert_command_factory(args: Namespace): + """ + Factory function used to convert a model TF 1.0 checkpoint in a PyTorch checkpoint. 
+ :return: ServeCommand + """ + return ConvertCommand(args.model_type, args.tf_checkpoint, args.pytorch_dump_output, + args.config, args.finetuning_task_name) + + +class ConvertCommand(BaseTransformersCLICommand): + + @staticmethod + def register_subcommand(parser: ArgumentParser): + """ + Register this command to argparse so it's available for the transformer-cli + :param parser: Root parser to register command-specific arguments + :return: + """ + train_parser = parser.add_parser('convert', help="CLI tool to run convert model from original " + "author checkpoints to Transformesr PyTorch checkpoints.") + train_parser.add_argument('--model_type', type=str, required=True, + help='Model\'s type.') + train_parser.add_argument('--tf_checkpoint', type=str, required=True, + help='TensorFlow checkpoint path or folder.') + train_parser.add_argument('--pytorch_dump_output', type=str, required=True, + help='Path to the PyTorch savd model output.') + train_parser.add_argument('--config', type=str, default="", + help='Configuration file path or folder.') + train_parser.add_argument('--finetuning_task_name', type=str, default=None, + help='Optional fine-tuning task name if the TF model was a finetuned model.') + train_parser.set_defaults(func=convert_command_factory) + + def __init__(self, model_type: str, tf_checkpoint: str, pytorch_dump_output: str, + config: str, finetuning_task_name: str, *args): + self._logger = getLogger('transformers-cli/converting') + + self._logger.info('Loading model {}'.format(model_type)) + self._model_type = model_type + self._tf_checkpoint = tf_checkpoint + self._pytorch_dump_output = pytorch_dump_output + self._config = config + self._finetuning_task_name = finetuning_task_name + + def run(self): + if self._model_type == "bert": + try: + from transformers.convert_bert_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch + except ImportError: + msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ + "In that case, it requires TensorFlow to be installed. Please see " \ + "https://www.tensorflow.org/install/ for installation instructions." + raise ImportError(msg) + + convert_tf_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) + elif self._model_type == "gpt": + from transformers.convert_openai_original_tf_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch + convert_openai_checkpoint_to_pytorch(self._tf_checkpoint, + self._config, + self._pytorch_dump_output) + elif self._model_type == "transfo_xl": + try: + from transformers.convert_transfo_xl_original_tf_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch + except ImportError: + msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ + "In that case, it requires TensorFlow to be installed. Please see " \ + "https://www.tensorflow.org/install/ for installation instructions." 
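[Editorial aside, not part of any patch] The new CLI commands above all follow the same argparse dispatch pattern: each subcommand registers a factory via `set_defaults(func=...)`, and the entry point calls `args.func(args)` and then `.run()` on the returned command object. A small self-contained sketch with illustrative names only:

from argparse import ArgumentParser

class EchoCommand:
    def __init__(self, args):
        self.args = args
    def run(self):
        print("model_type =", self.args.model_type)

parser = ArgumentParser("demo-cli")
subparsers = parser.add_subparsers()
convert_parser = subparsers.add_parser("convert")
convert_parser.add_argument("--model_type", required=True)
convert_parser.set_defaults(func=EchoCommand)

args = parser.parse_args(["convert", "--model_type", "bert"])
args.func(args).run()
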
+ raise ImportError(msg) + + if 'ckpt' in self._tf_checkpoint.lower(): + TF_CHECKPOINT = self._tf_checkpoint + TF_DATASET_FILE = "" + else: + TF_DATASET_FILE = self._tf_checkpoint + TF_CHECKPOINT = "" + convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, + self._config, + self._pytorch_dump_output, + TF_DATASET_FILE) + elif self._model_type == "gpt2": + try: + from transformers.convert_gpt2_original_tf_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch + except ImportError: + msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ + "In that case, it requires TensorFlow to be installed. Please see " \ + "https://www.tensorflow.org/install/ for installation instructions." + raise ImportError(msg) + + convert_gpt2_checkpoint_to_pytorch(self._tf_checkpoint, self._config, self._pytorch_dump_output) + elif self._model_type == "xlnet": + try: + from transformers.convert_xlnet_original_tf_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch + except ImportError: + msg = "transformers can only be used from the commandline to convert TensorFlow models in PyTorch, " \ + "In that case, it requires TensorFlow to be installed. Please see " \ + "https://www.tensorflow.org/install/ for installation instructions." + raise ImportError(msg) + + convert_xlnet_checkpoint_to_pytorch(self._tf_checkpoint, + self._config, + self._pytorch_dump_output, + self._finetuning_task_name) + elif self._model_type == "xlm": + from transformers.convert_xlm_original_pytorch_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch + + convert_xlm_checkpoint_to_pytorch(self._tf_checkpoint, self._pytorch_dump_output) + else: + raise ValueError("--model_type should be selected in the list [bert, gpt, gpt2, transfo_xl, xlnet, xlm]") diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py new file mode 100644 index 0000000000..0b47246ead --- /dev/null +++ b/transformers/commands/serving.py @@ -0,0 +1,176 @@ +from argparse import ArgumentParser, Namespace +from typing import List, Optional, Union, Any + +import torch +from fastapi import FastAPI, HTTPException, Body +from logging import getLogger + +from pydantic import BaseModel +from uvicorn import run + +from transformers import AutoModel, AutoTokenizer, AutoConfig +from transformers.commands import BaseTransformersCLICommand + + +def serve_command_factory(args: Namespace): + """ + Factory function used to instantiate serving server from provided command line arguments. 
+ :return: ServeCommand + """ + return ServeCommand(args.host, args.port, args.model, args.graphql) + + +class ServeResult(BaseModel): + """ + Base class for serving result + """ + model: str + + +class ServeModelInfoResult(ServeResult): + """ + Expose model information + """ + infos: dict + + +class ServeTokenizeResult(ServeResult): + """ + Tokenize result model + """ + tokens: List[str] + tokens_ids: Optional[List[int]] + + +class ServeDeTokenizeResult(ServeResult): + """ + DeTokenize result model + """ + text: str + + +class ServeForwardResult(ServeResult): + """ + Forward result model + """ + tokens: List[str] + tokens_ids: List[int] + output: Any + + +class ServeCommand(BaseTransformersCLICommand): + + @staticmethod + def register_subcommand(parser: ArgumentParser): + """ + Register this command to argparse so it's available for the transformer-cli + :param parser: Root parser to register command-specific arguments + :return: + """ + serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.') + serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.') + serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.') + serve_parser.add_argument('--model', type=str, required=True, help='Model\'s name or path to stored model to infer from.') + serve_parser.add_argument('--graphql', action='store_true', default=False, help='Enable GraphQL endpoints.') + serve_parser.set_defaults(func=serve_command_factory) + + def __init__(self, host: str, port: int, model: str, graphql: bool): + self._logger = getLogger('transformers-cli/serving') + + self._logger.info('Loading model {}'.format(model)) + self._model_name = model + self._model = AutoModel.from_pretrained(model) + self._tokenizer = AutoTokenizer.from_pretrained(model) + + self._logger.info('Serving model over {}:{}'.format(host, port)) + self._host = host + self._port = port + self._app = FastAPI() + + # Register routes + self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET']) + self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST']) + self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST']) + self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST']) + + def run(self): + run(self._app, host=self._host, port=self._port) + + def model_info(self): + return ServeModelInfoResult(model=self._model_name, infos=vars(self._model.config)) + + def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)): + """ + Tokenize the provided input and eventually returns corresponding tokens id: + - **text_input**: String to tokenize + - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping. 
+ """ + try: + tokens_txt = self._tokenizer.tokenize(text_input) + + if return_ids: + tokens_ids = self._tokenizer.convert_tokens_to_ids(tokens_txt) + return ServeTokenizeResult(model=self._model_name, tokens=tokens_txt, tokens_ids=tokens_ids) + else: + return ServeTokenizeResult(model=self._model_name, tokens=tokens_txt) + + except Exception as e: + raise HTTPException(status_code=500, detail={"model": self._model_name, "error": str(e)}) + + def detokenize(self, tokens_ids: List[int] = Body(None, embed=True), + skip_special_tokens: bool = Body(False, embed=True), + cleanup_tokenization_spaces: bool = Body(True, embed=True)): + """ + Detokenize the provided tokens ids to readable text: + - **tokens_ids**: List of tokens ids + - **skip_special_tokens**: Flag indicating to not try to decode special tokens + - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones. + """ + try: + decoded_str = self._tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) + return ServeDeTokenizeResult(model=self._model_name, text=decoded_str) + except Exception as e: + raise HTTPException(status_code=500, detail={"model": self._model_name, "error": str(e)}) + + def forward(self, inputs: Union[str, List[str], List[int]] = Body(None, embed=True), + attention_mask: Optional[List[int]] = Body(None, embed=True), + tokens_type_ids: Optional[List[int]] = Body(None, embed=True)): + """ + **inputs**: + **attention_mask**: + **tokens_type_ids**: + """ + + # Check we don't have empty string + if len(inputs) == 0: + return ServeForwardResult(model=self._model_name, output=[], attention=[]) + + if isinstance(inputs, str): + inputs_tokens = self._tokenizer.tokenize(inputs) + inputs_ids = self._tokenizer.convert_tokens_to_ids(inputs_tokens) + + elif isinstance(inputs, List): + if isinstance(inputs[0], str): + inputs_tokens = inputs + inputs_ids = self._tokenizer.convert_tokens_to_ids(inputs_tokens) + elif isinstance(inputs[0], int): + inputs_tokens = [] + inputs_ids = inputs + else: + error_msg = "inputs should be string, [str] of [int] (got {})".format(type(inputs[0])) + raise HTTPException(423, detail={"error": error_msg}) + else: + error_msg = "inputs should be string, [str] of [int] (got {})".format(type(inputs)) + raise HTTPException(423, detail={"error": error_msg}) + + try: + # Forward through the model + t_input_ids = torch.tensor(inputs_ids).unsqueeze(0) + output = self._model(t_input_ids, attention_mask, tokens_type_ids) + + return ServeForwardResult( + model=self._model_name, tokens=inputs_tokens, + tokens_ids=inputs_ids, output=output[0].tolist() + ) + except Exception as e: + raise HTTPException(500, {"error": str(e)}) diff --git a/transformers/commands/train.py b/transformers/commands/train.py new file mode 100644 index 0000000000..7fb3a54d25 --- /dev/null +++ b/transformers/commands/train.py @@ -0,0 +1,121 @@ +from argparse import ArgumentParser, Namespace + +from logging import getLogger + +from transformers.commands import BaseTransformersCLICommand +from transformers import (AutoTokenizer, is_tf_available, is_torch_available, + SingleSentenceClassificationProcessor, + convert_examples_to_features) +if is_tf_available(): + from transformers import TFAutoModelForSequenceClassification as SequenceClassifModel +elif is_torch_available(): + from transformers import AutoModelForSequenceClassification as SequenceClassifModel +else: + raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") + 
+# TF training parameters +BATCH_SIZE = 32 +EVAL_BATCH_SIZE = BATCH_SIZE * 2 +USE_XLA = False +USE_AMP = False + +def train_command_factory(args: Namespace): + """ + Factory function used to instantiate serving server from provided command line arguments. + :return: ServeCommand + """ + return TrainCommand(args.model) + + +class TrainCommand(BaseTransformersCLICommand): + + @staticmethod + def register_subcommand(parser: ArgumentParser): + """ + Register this command to argparse so it's available for the transformer-cli + :param parser: Root parser to register command-specific arguments + :return: + """ + train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.') + train_parser.add_argument('--train_data', type=str, required=True, + help='path to train (and optionally evaluation) dataset.') + train_parser.add_argument('--task', type=str, default='text_classification', + help='Task to train the model on.') + train_parser.add_argument('--model', type=str, default='bert-base-uncased', + help='Model\'s name or path to stored model.') + train_parser.add_argument('--valid_data', type=str, default='', + help='path to validation dataset.') + train_parser.add_argument('--valid_data_ratio', type=float, default=0.1, + help="if validation dataset is not provided, fraction of train dataset " + "to use as validation dataset.") + train_parser.set_defaults(func=train_command_factory) + + def __init__(self, model_name: str, task: str, train_data: str, + valid_data: str, valid_data_ratio: float): + self._logger = getLogger('transformers-cli/training') + + self._framework = 'tf' if is_tf_available() else 'torch' + + self._logger.info('Loading model {}'.format(model_name)) + self._model_name = model_name + self._tokenizer = AutoTokenizer.from_pretrained(model_name) + if task == 'text_classification': + self._model = SequenceClassifModel.from_pretrained(model_name) + elif task == 'token_classification': + raise NotImplementedError + elif task == 'question_answering': + raise NotImplementedError + + dataset = SingleSentenceClassificationProcessor.create_from_csv(train_data) + num_data_samples = len(SingleSentenceClassificationProcessor) + if valid_data: + self._train_dataset = dataset + self._num_train_samples = num_data_samples + self._valid_dataset = SingleSentenceClassificationProcessor.create_from_csv(valid_data) + self._num_valid_samples = len(self._valid_dataset) + else: + assert 0.0 < valid_data_ratio < 1.0, "--valid_data_ratio should be between 0.0 and 1.0" + self._num_valid_samples = num_data_samples * valid_data_ratio + self._num_train_samples = num_data_samples - self._num_valid_samples + self._train_dataset = dataset[self._num_train_samples] + self._valid_dataset = dataset[self._num_valid_samples] + + def run(self): + if self._framework == 'tf': + return self.run_tf() + return self.run_torch() + + def run_torch(self): + raise NotImplementedError + + def run_tf(self): + import tensorflow as tf + + tf.config.optimizer.set_jit(USE_XLA) + tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) + + # Prepare dataset as a tf.train_data.Dataset instance + train_dataset = convert_examples_to_features(self._train_dataset, self._tokenizer, mode='sequence_classification') + valid_dataset = convert_examples_to_features(self._valid_dataset, self._tokenizer, mode='sequence_classification') + train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1) + valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE) + + # Prepare training: Compile tf.keras model 
with optimizer, loss and learning rate schedule + opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08) + if USE_AMP: + # loss scaling is currently required when using mixed precision + opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + model.compile(optimizer=opt, loss=loss, metrics=[metric]) + + # Train and evaluate using tf.keras.Model.fit() + train_steps = train_examples//BATCH_SIZE + valid_steps = valid_examples//EVAL_BATCH_SIZE + + history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, + validation_data=valid_dataset, validation_steps=valid_steps) + + # Save TF2 model + os.makedirs('./save/', exist_ok=True) + model.save_pretrained('./save/') diff --git a/transformers/data/__init__.py b/transformers/data/__init__.py index 270a053268..5567952fd2 100644 --- a/transformers/data/__init__.py +++ b/transformers/data/__init__.py @@ -1,4 +1,4 @@ -from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures +from .processors import InputExample, InputFeatures, DataProcessor, SquadFeatures, SingleSentenceClassificationProcessor from .processors import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .processors import squad_convert_examples_to_features, SquadExample, SquadV1Processor, SquadV2Processor from .processors import xnli_output_modes, xnli_processors, xnli_tasks_num_labels diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 0f1b24893a..0cef0080f4 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ -from .utils import InputExample, InputFeatures, DataProcessor +from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor, convert_examples_to_features from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels \ No newline at end of file diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index 07bdf3150c..39544a1239 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -125,3 +125,185 @@ class DataProcessor(object): line = list(unicode(cell, 'utf-8') for cell in line) lines.append(line) return lines + + +class SingleSentenceClassificationProcessor(DataProcessor): + """ Generic processor for a single sentence classification data set.""" + def __init__(self, labels=None, examples=None): + self.labels = [] if labels is None else labels + self.examples = [] if examples is None else examples + + @classmethod + def create_from_csv(cls, file_name): + processor = cls() + processor.add_examples_from_csv(file_name) + return processor + + def __len__(self): + return len(self.examples) + + def __getitem__(self, idx): + if isinstance(idx, slice): + return SingleSentenceClassificationProcessor(labels=self.labels, + examples=self.examples[idx]) + return self.examples[idx] + + def get_labels(self): + """Gets the list of labels for this data set.""" + return self.labels + + def add_examples_from_csv(self, file_name): + lines = self._read_tsv(file_name) + 
self.add_examples_from_lines(lines) + + def add_examples_from_lines(self, lines, split_name='', overwrite_labels=False, overwrite_examples=False): + """Creates examples for the training and dev sets.""" + added_labels = set() + examples = [] + for (i, line) in enumerate(lines): + if len(line) > 2: + guid = "%s-%s" % (split_name, line[0]) if split_name else line[0] + label = line[1] + text_a = line[2] + else: + guid = "%s-%s" % (split_name, i) if split_name else "%s" % i + label = line[0] + text_a = line[1] + + added_labels.add(label) + examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + + # Update examples + if overwrite_examples: + self.examples = examples + else: + self.examples.extend(examples) + + # Update labels + if overwrite_labels: + self.labels = list(added_labels) + else: + self.labels = list(set(self.labels).union(added_labels)) + + return self.examples + + +def convert_examples_to_features(examples, tokenizer, + mode='sequence_classification', + max_length=512, + pad_on_left=False, + pad_token=0, + pad_token_segment_id=0, + mask_padding_with_zero=True): + """ + Loads a data file into a list of ``InputFeatures`` + + Args: + examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length + task: GLUE task + label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method + output_mode: String indicating the output mode. Either ``regression`` or ``classification`` + pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) + pad_token: Padding token + pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) + mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values + and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for + actual values) + + Returns: + If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` + containing the task-specific features. If the input is a list of ``InputExamples``, will return + a list of task-specific ``InputFeatures`` which can be fed to the model. + + """ + is_tf_dataset = False + if is_tf_available() and isinstance(examples, tf.data.Dataset): + is_tf_dataset = True + + if task is not None: + processor = glue_processors[task]() + if label_list is None: + label_list = processor.get_labels() + logger.info("Using label list %s for task %s" % (label_list, task)) + if output_mode is None: + output_mode = glue_output_modes[task] + logger.info("Using output mode %s for task %s" % (output_mode, task)) + + label_map = {label: i for i, label in enumerate(label_list)} + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + logger.info("Writing example %d" % (ex_index)) + if is_tf_dataset: + example = processor.get_example_from_tensor_dict(example) + + inputs = tokenizer.encode_plus( + example.text_a, + example.text_b, + add_special_tokens=True, + max_length=max_length, + ) + input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + + # Zero-pad up to the sequence length. 
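[Editorial aside, not part of any patch] A brief usage sketch for the single-sentence processor introduced in this patch (the file name is illustrative): `create_from_csv` reads a tab-separated file of labels and sentences, and slicing returns a new processor, which is how the train command splits off a validation set:

from transformers import SingleSentenceClassificationProcessor

processor = SingleSentenceClassificationProcessor.create_from_csv("train.tsv")
print(len(processor), processor.get_labels())

split = int(0.9 * len(processor))
train_processor, valid_processor = processor[:split], processor[split:]
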
+ padding_length = max_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) + + assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) + assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) + assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) + + if output_mode == "classification": + label = label_map[example.label] + elif output_mode == "regression": + label = float(example.label) + else: + raise KeyError(output_mode) + + if ex_index < 5: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) + logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids])) + logger.info("label: %s (id = %d)" % (example.label, label)) + + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + label=label)) + + if is_tf_available() and is_tf_dataset: + def gen(): + for ex in features: + yield ({'input_ids': ex.input_ids, + 'attention_mask': ex.attention_mask, + 'token_type_ids': ex.token_type_ids}, + ex.label) + + return tf.data.Dataset.from_generator(gen, + ({'input_ids': tf.int32, + 'attention_mask': tf.int32, + 'token_type_ids': tf.int32}, + tf.int64), + ({'input_ids': tf.TensorShape([None]), + 'attention_mask': tf.TensorShape([None]), + 'token_type_ids': tf.TensorShape([None])}, + tf.TensorShape([]))) + + return features From 2d8559731acbf673fe3e31aaeb17412a342cff73 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 16 Oct 2019 23:19:45 +0200 Subject: [PATCH 063/302] add pipeline - train --- transformers/commands/train.py | 127 +++++++---- transformers/data/processors/utils.py | 313 +++++++++++++------------- transformers/pipeline.py | 254 +++++++++++++++++++++ 3 files changed, 485 insertions(+), 209 deletions(-) create mode 100644 transformers/pipeline.py diff --git a/transformers/commands/train.py b/transformers/commands/train.py index 7fb3a54d25..fc89d48594 100644 --- a/transformers/commands/train.py +++ b/transformers/commands/train.py @@ -1,5 +1,5 @@ +import os from argparse import ArgumentParser, Namespace - from logging import getLogger from transformers.commands import BaseTransformersCLICommand @@ -14,8 +14,6 @@ else: raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") # TF training parameters -BATCH_SIZE = 32 -EVAL_BATCH_SIZE = BATCH_SIZE * 2 USE_XLA = False USE_AMP = False @@ -24,7 +22,7 @@ def train_command_factory(args: Namespace): Factory function used to instantiate serving server from provided command line arguments. 
:return: ServeCommand """ - return TrainCommand(args.model) + return TrainCommand(args) class TrainCommand(BaseTransformersCLICommand): @@ -38,50 +36,84 @@ class TrainCommand(BaseTransformersCLICommand): """ train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.') train_parser.add_argument('--train_data', type=str, required=True, - help='path to train (and optionally evaluation) dataset.') + help="path to train (and optionally evaluation) dataset as a csv with " + "tab separated labels and sentences.") + + train_parser.add_argument('--column_label', type=int, default=0, + help='Column of the dataset csv file with example labels.') + train_parser.add_argument('--column_text', type=int, default=1, + help='Column of the dataset csv file with example texts.') + train_parser.add_argument('--column_id', type=int, default=2, + help='Column of the dataset csv file with example ids.') + + train_parser.add_argument('--validation_data', type=str, default='', + help='path to validation dataset.') + train_parser.add_argument('--validation_split', type=float, default=0.1, + help="if validation dataset is not provided, fraction of train dataset " + "to use as validation dataset.") + + train_parser.add_argument('--output', type=str, default='./', + help='path to saved the trained model.') + train_parser.add_argument('--task', type=str, default='text_classification', help='Task to train the model on.') train_parser.add_argument('--model', type=str, default='bert-base-uncased', help='Model\'s name or path to stored model.') - train_parser.add_argument('--valid_data', type=str, default='', - help='path to validation dataset.') - train_parser.add_argument('--valid_data_ratio', type=float, default=0.1, - help="if validation dataset is not provided, fraction of train dataset " - "to use as validation dataset.") + train_parser.add_argument('--train_batch_size', type=int, default=32, + help='Batch size for training.') + train_parser.add_argument('--valid_batch_size', type=int, default=64, + help='Batch size for validation.') + train_parser.add_argument('--learning_rate', type=float, default=3e-5, + help="Learning rate.") + train_parser.add_argument('--adam_epsilon', type=float, default=1e-08, + help="Epsilon for Adam optimizer.") train_parser.set_defaults(func=train_command_factory) - def __init__(self, model_name: str, task: str, train_data: str, - valid_data: str, valid_data_ratio: float): - self._logger = getLogger('transformers-cli/training') + def __init__(self, args: Namespace): + self.logger = getLogger('transformers-cli/training') - self._framework = 'tf' if is_tf_available() else 'torch' + self.framework = 'tf' if is_tf_available() else 'torch' - self._logger.info('Loading model {}'.format(model_name)) - self._model_name = model_name - self._tokenizer = AutoTokenizer.from_pretrained(model_name) - if task == 'text_classification': - self._model = SequenceClassifModel.from_pretrained(model_name) - elif task == 'token_classification': + os.makedirs(args.output) + self.output = args.output + + self.column_label = args.column_label + self.column_text = args.column_text + self.column_id = args.column_id + + self.logger.info('Loading model {}'.format(args.model_name)) + self.model_name = args.model_name + self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + if args.task == 'text_classification': + self.model = SequenceClassifModel.from_pretrained(args.model_name) + elif args.task == 'token_classification': raise NotImplementedError - elif task == 'question_answering': 
+ elif args.task == 'question_answering': raise NotImplementedError - dataset = SingleSentenceClassificationProcessor.create_from_csv(train_data) - num_data_samples = len(SingleSentenceClassificationProcessor) - if valid_data: - self._train_dataset = dataset - self._num_train_samples = num_data_samples - self._valid_dataset = SingleSentenceClassificationProcessor.create_from_csv(valid_data) - self._num_valid_samples = len(self._valid_dataset) + self.logger.info('Loading dataset from {}'.format(args.train_data)) + dataset = SingleSentenceClassificationProcessor.create_from_csv(args.train_data) + num_data_samples = len(dataset) + if args.validation_data: + self.logger.info('Loading validation dataset from {}'.format(args.validation_data)) + self.valid_dataset = SingleSentenceClassificationProcessor.create_from_csv(args.validation_data) + self.num_valid_samples = len(self.valid_dataset) + self.train_dataset = dataset + self.num_train_samples = num_data_samples else: - assert 0.0 < valid_data_ratio < 1.0, "--valid_data_ratio should be between 0.0 and 1.0" - self._num_valid_samples = num_data_samples * valid_data_ratio - self._num_train_samples = num_data_samples - self._num_valid_samples - self._train_dataset = dataset[self._num_train_samples] - self._valid_dataset = dataset[self._num_valid_samples] + assert 0.0 < args.validation_split < 1.0, "--validation_split should be between 0.0 and 1.0" + self.num_valid_samples = num_data_samples * args.validation_split + self.num_train_samples = num_data_samples - self.num_valid_samples + self.train_dataset = dataset[self.num_train_samples] + self.valid_dataset = dataset[self.num_valid_samples] + + self.train_batch_size = args.train_batch_size + self.valid_batch_size = args.valid_batch_size + self.learning_rate = args.learning_rate + self.adam_epsilon = args.adam_epsilon def run(self): - if self._framework == 'tf': + if self.framework == 'tf': return self.run_tf() return self.run_torch() @@ -95,27 +127,28 @@ class TrainCommand(BaseTransformersCLICommand): tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) # Prepare dataset as a tf.train_data.Dataset instance - train_dataset = convert_examples_to_features(self._train_dataset, self._tokenizer, mode='sequence_classification') - valid_dataset = convert_examples_to_features(self._valid_dataset, self._tokenizer, mode='sequence_classification') - train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1) - valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE) + self.logger.info('Tokenizing and processing dataset') + train_dataset = self.train_dataset.get_features(self.tokenizer) + valid_dataset = self.valid_dataset.get_features(self.tokenizer) + train_dataset = train_dataset.shuffle(128).batch(self.train_batch_size).repeat(-1) + valid_dataset = valid_dataset.batch(self.valid_batch_size) # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule - opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08) + opt = tf.keras.optimizers.Adam(learning_rate=args.learning_rate, epsilon=self.adam_epsilon) if USE_AMP: # loss scaling is currently required when using mixed precision opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') - model.compile(optimizer=opt, loss=loss, metrics=[metric]) + self.model.compile(optimizer=opt, loss=loss, metrics=[metric]) # Train and evaluate using 
tf.keras.Model.fit() - train_steps = train_examples//BATCH_SIZE - valid_steps = valid_examples//EVAL_BATCH_SIZE + train_steps = self.num_train_samples//self.train_batch_size + valid_steps = self.num_valid_samples//self.valid_batch_size - history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, - validation_data=valid_dataset, validation_steps=valid_steps) + self.logger.info('Training model') + history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, + validation_data=valid_dataset, validation_steps=valid_steps) - # Save TF2 model - os.makedirs('./save/', exist_ok=True) - model.save_pretrained('./save/') + # Save trained model + self.model.save_pretrained(self.output) diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index 39544a1239..75bed86042 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -18,6 +18,11 @@ import csv import sys import copy import json +import logging + +from ...file_utils import is_tf_available, is_torch_available + +logger = logging.getLogger(__name__) class InputExample(object): """ @@ -64,7 +69,7 @@ class InputFeatures(object): label: Label corresponding to the input """ - def __init__(self, input_ids, attention_mask, token_type_ids, label): + def __init__(self, input_ids, attention_mask=None, token_type_ids=None, label=None): self.input_ids = input_ids self.attention_mask = attention_mask self.token_type_ids = token_type_ids @@ -86,34 +91,6 @@ class InputFeatures(object): class DataProcessor(object): """Base class for data converters for sequence classification data sets.""" - def get_example_from_tensor_dict(self, tensor_dict): - """Gets an example from a dict with tensorflow tensors - - Args: - tensor_dict: Keys and values should match the corresponding Glue - tensorflow_dataset examples. - """ - raise NotImplementedError() - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - def tfds_map(self, example): - """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. 
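# --- Illustrative sketch (editor's addition, not part of the patch) ---------------
# The run_tf() method updated above follows the standard TF 2.0 Keras fine-tuning
# pattern: compile with Adam + sparse categorical cross-entropy, then fit() with
# explicit step counts. A condensed, self-contained version of that pattern,
# assuming TensorFlow 2.x, a tf.keras model that returns logits, and tf.data datasets:
import tensorflow as tf

def compile_and_fit(model, train_dataset, valid_dataset, num_train, num_valid,
                    train_batch_size=32, valid_batch_size=64,
                    learning_rate=3e-5, adam_epsilon=1e-8, epochs=2):
    opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=adam_epsilon)
    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
    model.compile(optimizer=opt, loss=loss, metrics=[metric])
    return model.fit(train_dataset.shuffle(128).batch(train_batch_size).repeat(-1),
                     epochs=epochs,
                     steps_per_epoch=num_train // train_batch_size,
                     validation_data=valid_dataset.batch(valid_batch_size),
                     validation_steps=num_valid // valid_batch_size)
# -----------------------------------------------------------------------------------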
- This method converts examples to the correct format.""" - if len(self.get_labels()) > 1: - example.label = self.get_labels()[int(example.label)] - return example - @classmethod def _read_tsv(cls, input_file, quotechar=None): """Reads a tab separated value file.""" @@ -129,15 +106,11 @@ class DataProcessor(object): class SingleSentenceClassificationProcessor(DataProcessor): """ Generic processor for a single sentence classification data set.""" - def __init__(self, labels=None, examples=None): + def __init__(self, labels=None, examples=None, mode='classification', verbose=False): self.labels = [] if labels is None else labels self.examples = [] if examples is None else examples - - @classmethod - def create_from_csv(cls, file_name): - processor = cls() - processor.add_examples_from_csv(file_name) - return processor + self.mode = mode + self.verbose = verbose def __len__(self): return len(self.examples) @@ -148,30 +121,40 @@ class SingleSentenceClassificationProcessor(DataProcessor): examples=self.examples[idx]) return self.examples[idx] - def get_labels(self): - """Gets the list of labels for this data set.""" - return self.labels + @classmethod + def create_from_csv(cls, file_name, **kwargs): + processor = cls(**kwargs) + processor.add_examples_from_csv(file_name) + return processor - def add_examples_from_csv(self, file_name): + def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None, + overwrite_labels=False, overwrite_examples=False): lines = self._read_tsv(file_name) - self.add_examples_from_lines(lines) - - def add_examples_from_lines(self, lines, split_name='', overwrite_labels=False, overwrite_examples=False): - """Creates examples for the training and dev sets.""" - added_labels = set() - examples = [] + texts = [] + labels = [] + ids = [] for (i, line) in enumerate(lines): - if len(line) > 2: - guid = "%s-%s" % (split_name, line[0]) if split_name else line[0] - label = line[1] - text_a = line[2] + texts.append(line[column_text]) + labels.append(line[column_label]) + if column_id is not None: + ids.append(line[column_id]) else: guid = "%s-%s" % (split_name, i) if split_name else "%s" % i - label = line[0] - text_a = line[1] + ids.append(guid) + return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples) + + def add_examples(self, texts, labels, ids=None, overwrite_labels=False, overwrite_examples=False): + if ids is None: + ids = [None] * len(texts) + assert len(texts) == len(labels) + assert len(texts) == len(ids) + + examples = [] + added_labels = set() + for (text, label, guid) in zip(texts, labels, ids): added_labels.add(label) - examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) + examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label)) # Update examples if overwrite_examples: @@ -187,123 +170,129 @@ class SingleSentenceClassificationProcessor(DataProcessor): return self.examples + @classmethod + def create_from_examples(cls, texts, labels, **kwargs): + processor = cls(**kwargs) + processor.add_examples(texts, labels) + return processor -def convert_examples_to_features(examples, tokenizer, - mode='sequence_classification', - max_length=512, - pad_on_left=False, - pad_token=0, - pad_token_segment_id=0, - mask_padding_with_zero=True): - """ - Loads a data file into a list of ``InputFeatures`` + def get_features(self, + tokenizer, + max_length=None, + pad_on_left=False, + pad_token=0, + mask_padding_with_zero=True, + 
return_tensors=None): + """ + Convert examples in a list of ``InputFeatures`` - Args: - examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples. - tokenizer: Instance of a tokenizer that will tokenize the examples - max_length: Maximum example length - task: GLUE task - label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method - output_mode: String indicating the output mode. Either ``regression`` or ``classification`` - pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) - pad_token: Padding token - pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4) - mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values - and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for - actual values) + Args: + tokenizer: Instance of a tokenizer that will tokenize the examples + max_length: Maximum example length + task: GLUE task + label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method + output_mode: String indicating the output mode. Either ``regression`` or ``classification`` + pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default) + pad_token: Padding token + mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values + and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for + actual values) - Returns: - If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` - containing the task-specific features. If the input is a list of ``InputExamples``, will return - a list of task-specific ``InputFeatures`` which can be fed to the model. + Returns: + If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset`` + containing the task-specific features. If the input is a list of ``InputExamples``, will return + a list of task-specific ``InputFeatures`` which can be fed to the model. 
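# --- Illustrative sketch (editor's addition, not part of the patch) ---------------
# Expected usage of the processor refactor introduced here: build a processor from
# raw texts and labels, then let get_features() tokenize, pad, and optionally wrap
# the result in a framework-specific dataset. Assumes this patch series is installed;
# 'bert-base-uncased' is only an example checkpoint.
from transformers import AutoTokenizer
from transformers.data.processors.utils import SingleSentenceClassificationProcessor

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
processor = SingleSentenceClassificationProcessor.create_from_examples(
    texts=['a great movie', 'a terrible plot'], labels=['pos', 'neg'])
features = processor.get_features(tokenizer, max_length=128)      # list of InputFeatures
tf_dataset = processor.get_features(tokenizer, max_length=128,
                                    return_tensors='tf')          # tf.data.Dataset (needs TF 2.0)
# -----------------------------------------------------------------------------------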
- """ - is_tf_dataset = False - if is_tf_available() and isinstance(examples, tf.data.Dataset): - is_tf_dataset = True + """ - if task is not None: - processor = glue_processors[task]() - if label_list is None: - label_list = processor.get_labels() - logger.info("Using label list %s for task %s" % (label_list, task)) - if output_mode is None: - output_mode = glue_output_modes[task] - logger.info("Using output mode %s for task %s" % (output_mode, task)) + label_map = {label: i for i, label in enumerate(self.labels)} - label_map = {label: i for i, label in enumerate(label_list)} + all_input_ids = [] + for (ex_index, example) in enumerate(self.examples): + if ex_index % 10000 == 0: + logger.info("Tokenizing example %d", ex_index) - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - logger.info("Writing example %d" % (ex_index)) - if is_tf_dataset: - example = processor.get_example_from_tensor_dict(example) + input_ids = tokenizer.encode( + example.text_a, + add_special_tokens=True, + max_length=min(max_length, tokenizer.max_len), + ) + all_input_ids.append(input_ids) - inputs = tokenizer.encode_plus( - example.text_a, - example.text_b, - add_special_tokens=True, - max_length=max_length, - ) - input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"] + batch_length = max(len(input_ids) for input_ids in all_input_ids) - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) + features = [] + for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, examples)): + if ex_index % 10000 == 0: + logger.info("Writing example %d", ex_index) + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) - # Zero-pad up to the sequence length. - padding_length = max_length - len(input_ids) - if pad_on_left: - input_ids = ([pad_token] * padding_length) + input_ids - attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask - token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids + # Zero-pad up to the sequence length. 
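# --- Illustrative sketch (editor's addition, not part of the patch) ---------------
# The padding logic added just below pads every example to the longest sequence in
# the current set of examples (batch_length) rather than to a fixed max_seq_length.
# A tiny standalone illustration of that idea with toy token ids:
def pad_batch(batch_input_ids, pad_token=0):
    batch_length = max(len(ids) for ids in batch_input_ids)
    padded, masks = [], []
    for ids in batch_input_ids:
        pad_len = batch_length - len(ids)
        padded.append(ids + [pad_token] * pad_len)    # pad on the right
        masks.append([1] * len(ids) + [0] * pad_len)  # 1 = real token, 0 = padding
    return padded, masks

print(pad_batch([[101, 7592, 102], [101, 102]]))
# -> ([[101, 7592, 102], [101, 102, 0]], [[1, 1, 1], [1, 1, 0]])
# -----------------------------------------------------------------------------------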
+ padding_length = batch_length - len(input_ids) + if pad_on_left: + input_ids = ([pad_token] * padding_length) + input_ids + attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask + else: + input_ids = input_ids + ([pad_token] * padding_length) + attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) + + assert len(input_ids) == batch_length, "Error with input length {} vs {}".format(len(input_ids), batch_length) + assert len(attention_mask) == batch_length, "Error with input length {} vs {}".format(len(attention_mask), batch_length) + + if self.mode == "classification": + label = label_map[example.label] + elif self.mode == "regression": + label = float(example.label) + else: + raise ValueError(self.mode) + + if ex_index < 5 and self.verbose: + logger.info("*** Example ***") + logger.info("guid: %s" % (example.guid)) + logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) + logger.info("label: %s (id = %d)" % (example.label, label)) + + features.append( + InputFeatures(input_ids=input_ids, + attention_mask=attention_mask, + label=label)) + + if return_tensors is None: + return features + elif return_tensors == 'tf': + if not is_tf_available(): + raise ImportError("return_tensors set to 'tf' but TensorFlow 2.0 can't be imported") + import tensorflow as tf + def gen(): + for ex in features: + yield ({'input_ids': ex.input_ids, + 'attention_mask': ex.attention_mask}, + ex.label) + + dataset = tf.data.Dataset.from_generator(gen, + ({'input_ids': tf.int32, + 'attention_mask': tf.int32}, + tf.int64), + ({'input_ids': tf.TensorShape([None]), + 'attention_mask': tf.TensorShape([None])}, + tf.TensorShape([]))) + return dataset + elif return_tensors == 'pt': + if not is_torch_available(): + raise ImportError("return_tensors set to 'pt' but PyTorch can't be imported") + import torch + from torch.utils.data import TensorDataset + all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) + all_attention_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + if self.mode == "classification": + all_labels = torch.tensor([f.label for f in features], dtype=torch.long) + elif self.mode == "regression": + all_labels = torch.tensor([f.label for f in features], dtype=torch.float) + + dataset = TensorDataset(all_input_ids, all_attention_mask, all_labels) + return dataset else: - input_ids = input_ids + ([pad_token] * padding_length) - attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length) - token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length) - - assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length) - assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(len(attention_mask), max_length) - assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(len(token_type_ids), max_length) - - if output_mode == "classification": - label = label_map[example.label] - elif output_mode == "regression": - label = float(example.label) - else: - raise KeyError(output_mode) - - if ex_index < 5: - logger.info("*** Example ***") - logger.info("guid: %s" % (example.guid)) - logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask])) - logger.info("token_type_ids: %s" % " 
".join([str(x) for x in token_type_ids])) - logger.info("label: %s (id = %d)" % (example.label, label)) - - features.append( - InputFeatures(input_ids=input_ids, - attention_mask=attention_mask, - token_type_ids=token_type_ids, - label=label)) - - if is_tf_available() and is_tf_dataset: - def gen(): - for ex in features: - yield ({'input_ids': ex.input_ids, - 'attention_mask': ex.attention_mask, - 'token_type_ids': ex.token_type_ids}, - ex.label) - - return tf.data.Dataset.from_generator(gen, - ({'input_ids': tf.int32, - 'attention_mask': tf.int32, - 'token_type_ids': tf.int32}, - tf.int64), - ({'input_ids': tf.TensorShape([None]), - 'attention_mask': tf.TensorShape([None]), - 'token_type_ids': tf.TensorShape([None])}, - tf.TensorShape([]))) - - return features + raise ValueError("return_tensors should be one of 'tf' or 'pt'") diff --git a/transformers/pipeline.py b/transformers/pipeline.py new file mode 100644 index 0000000000..15adc620b1 --- /dev/null +++ b/transformers/pipeline.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Pipeline class: Tokenizer + Model. """ + +from __future__ import absolute_import, division, print_function, unicode_literals +import os +import logging + +from .modeling_auto import (AutoModel, AutoModelForQuestionAnswering, + AutoModelForSequenceClassification, + AutoModelWithLMHead) +from .tokenization_auto import AutoTokenizer +from .file_utils import add_start_docstrings, is_tf_available, is_torch_available +from .data.processors import SingleSentenceClassificationProcessor + +if is_tf_available(): + import tensorflow as tf +if is_torch_available(): + import torch + +logger = logging.getLogger(__name__) + +# TF training parameters +USE_XLA = False +USE_AMP = False + +class TextClassificationPipeline(object): + r""" + :class:`~transformers.TextClassificationPipeline` is a class encapsulating a pretrained model and + its tokenizer and will be instantiated as one of the base model classes of the library + when created with the `Pipeline.from_pretrained(pretrained_model_name_or_path)` + class method. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. 
+ + The base model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModel (DistilBERT model) + - contains `roberta`: RobertaModel (RoBERTa model) + - contains `bert`: BertModel (Bert model) + - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) + - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) + - contains `ctrl`: CTRLModel (Salesforce CTRL model) + - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) + - contains `xlnet`: XLNetModel (XLNet model) + - contains `xlm`: XLMModel (XLM model) + """ + def __init__(self, tokenizer, model): + self.tokenizer = tokenizer + self.model = model + if is_tf_available(): + self.framework = 'tf' + elif is_torch_available(): + self.framework = 'pt' + else: + raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") + self.is_compiled = False + + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiates one of the base model classes of the library + from a pre-trained model configuration. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertModel (DistilBERT model) + - contains `roberta`: RobertaModel (RoBERTa model) + - contains `bert`: BertModel (Bert model) + - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) + - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) + - contains `ctrl`: CTRLModel (Salesforce CTRL model) + - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) + - contains `xlnet`: XLNetModel (XLNet model) + - contains `xlm`: XLMModel (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. 
+ This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModel.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + # Extract tokenizer and model arguments + tokenizer_kwargs = {} + for key in kwargs: + if key.startswith('tokenizer_'): + # Specific to the tokenizer + tokenizer_kwargs[key.replace('tokenizer_', '')] = kwargs.pop(key) + elif not key.startswith('model_'): + # used for both the tokenizer and the model + tokenizer_kwargs[key] = kwargs[key] + + model_kwargs = {} + for key in kwargs: + if key.startswith('model_'): + # Specific to the model + model_kwargs[key.replace('model_', '')] = kwargs.pop(key) + elif not key.startswith('tokenizer_'): + # used for both the tokenizer and the model + model_kwargs[key] = kwargs[key] + + tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **tokenizer_kwargs) + model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) + return cls(tokenizer, model) + + + def save_pretrained(self, save_directory): + if not os.path.isdir(save_directory): + logger.error("Saving directory ({}) should be a directory".format(save_directory)) + return + self.model.save_pretrained(save_directory) + self.tokenizer.save_pretrained(save_directory) + + + def compile(self, learning_rate=3e-5, epsilon=1e-8): + if self.framework == 'tf': + logger.info('Preparing model') + # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule + opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) + if USE_AMP: + # loss scaling is currently required when using mixed precision + opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') + loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) + metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') + self.model.compile(optimizer=opt, loss=loss, metrics=[metric]) + else: + raise NotImplementedError + self.is_compiled = True + + + def prepare_data(self, train_samples_text, train_samples_labels, + valid_samples_text=None, valid_samples_labels=None, + validation_split=0.1): + dataset = SingleSentenceClassificationProcessor.create_from_examples(train_samples_text, + train_samples_labels) + num_data_samples = len(dataset) + if valid_samples_text is not None and valid_samples_labels is not None: + valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(valid_samples_text, + valid_samples_labels) + num_valid_samples = len(valid_dataset) + train_dataset = dataset + num_train_samples = num_data_samples + else: + assert 0.0 < validation_split < 1.0, "validation_split should be between 0.0 and 1.0" + num_valid_samples = int(num_data_samples * validation_split) + num_train_samples = num_data_samples - num_valid_samples + train_dataset = dataset[num_train_samples] + valid_dataset = dataset[num_valid_samples] + + logger.info('Tokenizing and processing dataset') + train_dataset = train_dataset.get_features(self.tokenizer, return_tensors=self.framework) + valid_dataset = valid_dataset.get_features(self.tokenizer, return_tensors=self.framework) + return train_dataset, valid_dataset, num_train_samples, 
num_valid_samples + + + def fit(self, train_samples_text, train_samples_labels, + valid_samples_text=None, valid_samples_labels=None, + train_batch_size=None, valid_batch_size=None, + validation_split=0.1, + **kwargs): + + if not self.is_compiled: + self.compile() + + datasets = self.prepare_data(train_samples_text, train_samples_labels, + valid_samples_text, valid_samples_labels, + validation_split) + train_dataset, valid_dataset, num_train_samples, num_valid_samples = datasets + + train_steps = num_train_samples//train_batch_size + valid_steps = num_valid_samples//valid_batch_size + + if self.framework == 'tf': + # Prepare dataset as a tf.train_data.Dataset instance + train_dataset = train_dataset.shuffle(128).batch(train_batch_size).repeat(-1) + valid_dataset = valid_dataset.batch(valid_batch_size) + + logger.info('Training TF 2.0 model') + history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, + validation_data=valid_dataset, validation_steps=valid_steps, **kwargs) + else: + raise NotImplementedError + + + def __call__(self, text): + inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, return_tensors=self.framework) + if self.framework == 'tf': + # TODO trace model + predictions = self.model(**inputs)[0] + else: + with torch.no_grad(): + predictions = self.model(**inputs)[0] + + return predictions.numpy().tolist() From b81ab431f26e4a2fadf37bdd803ec26c66ee719c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 17 Oct 2019 12:06:27 +0200 Subject: [PATCH 064/302] updating AutoModels and AutoConfiguration - adding pipelines --- transformers/configuration_auto.py | 28 +++++ transformers/modeling_auto.py | 182 +++++++++++++++++++++++++++-- transformers/modeling_tf_auto.py | 169 +++++++++++++++++++++++++-- transformers/pipeline.py | 83 ++----------- 4 files changed, 372 insertions(+), 90 deletions(-) diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index 43f251bd0c..47379d2f5a 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -61,6 +61,34 @@ class AutoConfig(object): raise EnvironmentError("AutoConfig is designed to be instantiated " "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.") + @classmethod + def for_model(cls, model_type, *args, **kwargs): + if 'distilbert' in model_type: + return DistilBertConfig(*args, **kwargs) + elif 'roberta' in model_type: + return RobertaConfig(*args, **kwargs) + elif 'bert' in model_type: + return BertConfig(*args, **kwargs) + elif 'openai-gpt' in model_type: + return OpenAIGPTConfig(*args, **kwargs) + elif 'gpt2' in model_type: + return GPT2Config(*args, **kwargs) + elif 'transfo-xl' in model_type: + return TransfoXLConfig(*args, **kwargs) + elif 'xlnet' in model_type: + return XLNetConfig(*args, **kwargs) + elif 'xlm' in model_type: + return XLMConfig(*args, **kwargs) + elif 'ctrl' in model_type: + return CTRLConfig(*args, **kwargs) + elif 'albert' in model_type: + return AlbertConfig(*args, **kwargs) + elif 'camembert' in model_type: + return CamembertConfig(*args, **kwargs) + raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'xlm', 'roberta', 'ctrl', 'camembert', 'albert'".format(model_type)) + @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): r""" Instantiate a one of the configuration classes of the library diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index b63e43d73b..0c8bffa883 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -18,6 +18,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging +from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig, + DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, + TransfoXLConfig, XLMConfig, XLNetConfig) + from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel @@ -27,8 +31,7 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification -from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice -from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice +from .modeling_camembert import CamembertModel, CamembertForQuestionAnswering, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering from .modeling_utils import PreTrainedModel, SequenceSummary @@ -43,7 +46,7 @@ class AutoModel(object): :class:`~transformers.AutoModel` is a generic model class that will be instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` - class method. + or the `AutoModel.from_config(config)` class methods. The `from_pretrained()` method takes care of returning the correct model class instance using pattern matching on the `pretrained_model_name_or_path` string. @@ -66,7 +69,54 @@ class AutoModel(object): """ def __init__(self): raise EnvironmentError("AutoModel is designed to be instantiated " - "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.") + "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModel.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. 
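# --- Illustrative sketch (editor's addition, not part of the patch) ---------------
# The new AutoConfig.for_model() above and the from_config() methods added in this
# commit are expected to compose as follows, building a freshly initialised
# (untrained) model directly from a configuration object. Assumes the patch series
# is installed as `transformers`:
from transformers import AutoConfig, AutoModel

config = AutoConfig.for_model('bert')   # dispatches to BertConfig() with default hyper-parameters
model = AutoModel.from_config(config)   # randomly initialised BertModel, no weights download
# -----------------------------------------------------------------------------------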
+ + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) + - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model) + - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model) + - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model) + - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `xlm` configuration class: XLMModel (XLM model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = AutoModel.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, DistilBertConfig): + return DistilBertModel(config) + elif isinstance(config, RobertaConfig): + return RobertaModel(config) + elif isinstance(config, BertConfig): + return BertModel(config) + elif isinstance(config, OpenAIGPTConfig): + return OpenAIGPTModel(config) + elif isinstance(config, GPT2Config): + return GPT2Model(config) + elif isinstance(config, TransfoXLConfig): + return TransfoXLModel(config) + elif isinstance(config, XLNetConfig): + return XLNetModel(config) + elif isinstance(config, XLMConfig): + return XLMModel(config) + elif isinstance(config, CTRLConfig): + return CTRLModel(config) + elif isinstance(config, AlbertConfig): + return AlbertModel(config) + elif isinstance(config, CamembertConfig): + return CamembertModel(config) + raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -201,7 +251,54 @@ class AutoModelWithLMHead(object): """ def __init__(self): raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " - "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelWithLMHead.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. 
+ + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) + - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model) + - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model) + - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model) + - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `xlm` configuration class: XLMModel (XLM model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = AutoModelWithLMHead.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, DistilBertConfig): + return DistilBertForMaskedLM(config) + elif isinstance(config, RobertaConfig): + return RobertaForMaskedLM(config) + elif isinstance(config, BertConfig): + return BertForMaskedLM(config) + elif isinstance(config, OpenAIGPTConfig): + return OpenAIGPTLMHeadModel(config) + elif isinstance(config, GPT2Config): + return GPT2LMHeadModel(config) + elif isinstance(config, TransfoXLConfig): + return TransfoXLLMHeadModel(config) + elif isinstance(config, XLNetConfig): + return XLNetLMHeadModel(config) + elif isinstance(config, XLMConfig): + return XLMWithLMHeadModel(config) + elif isinstance(config, CTRLConfig): + return CTRLLMHeadModel(config) + elif isinstance(config, AlbertConfig): + return AlbertLMHeadModel(config) + elif isinstance(config, CamembertConfig): + return CamembertLMHeadModel(config) + raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -333,8 +430,43 @@ class AutoModelForSequenceClassification(object): This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): - raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated " - "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + raise EnvironmentError("AutoModelForSequenceClassification is designed to be instantiated " + "using the `AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForSequenceClassification.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) + - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `xlm` configuration class: XLMModel (XLM model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 
+            model = AutoModelForSequenceClassification.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, AlbertConfig):
+            return AlbertForSequenceClassification(config)
+        elif isinstance(config, CamembertConfig):
+            return CamembertForSequenceClassification(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForSequenceClassification(config)
+        elif isinstance(config, RobertaConfig):
+            return RobertaForSequenceClassification(config)
+        elif isinstance(config, BertConfig):
+            return BertForSequenceClassification(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForSequenceClassification(config)
+        elif isinstance(config, XLMConfig):
+            return XLMForSequenceClassification(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
@@ -453,8 +585,40 @@ class AutoModelForQuestionAnswering(object):
         This class cannot be instantiated using `__init__()` (throws an error).
     """
     def __init__(self):
-        raise EnvironmentError("AutoModelWithLMHead is designed to be instantiated "
-            "using the `AutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.")
+        raise EnvironmentError("AutoModelForQuestionAnswering is designed to be instantiated "
+            "using the `AutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or "
+            "`AutoModelForQuestionAnswering.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+        config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            The model class to instantiate is selected based on the configuration class:
+                - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model)
+                - isInstance of `bert` configuration class: BertModel (Bert model)
+                - isInstance of `xlnet` configuration class: XLNetModel (XLNet model)
+                - isInstance of `xlm` configuration class: XLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = AutoModelForQuestionAnswering.from_config(config)  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+        """
+        if isinstance(config, AlbertConfig):
+            return AlbertForQuestionAnswering(config)
+        elif isinstance(config, CamembertConfig):
+            return CamembertForQuestionAnswering(config)
+        elif isinstance(config, DistilBertConfig):
+            return DistilBertForQuestionAnswering(config)
+        elif isinstance(config, BertConfig):
+            return BertForQuestionAnswering(config)
+        elif isinstance(config, XLNetConfig):
+            return XLNetForQuestionAnswering(config)
+        elif isinstance(config, XLMConfig):
+            return XLMForQuestionAnswering(config)
+        raise ValueError("Unrecognized configuration class {}".format(config))
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index cfe19ead2a..e78b91cfcc 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -18,6 +18,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
+from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig,
+                                 GPT2Config, OpenAIGPTConfig, RobertaConfig,
+                                 TransfoXLConfig, XLMConfig, XLNetConfig)
+
 from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
 from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
 from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
@@ -59,7 +63,50 @@ class TFAutoModel(object):
     """
     def __init__(self):
         raise EnvironmentError("TFAutoModel is designed to be instantiated "
-            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+            "using the `TFAutoModel.from_pretrained(pretrained_model_name_or_path)` or "
+            "`TFAutoModel.from_config(config)` methods.")
+
+    @classmethod
+    def from_config(cls, config):
+        r""" Instantiates one of the base model classes of the library
+        from a configuration.
+
+        config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+            The model class to instantiate is selected based on the configuration class:
+                - isInstance of `distilbert` configuration class: TFDistilBertModel (DistilBERT model)
+                - isInstance of `roberta` configuration class: TFRobertaModel (RoBERTa model)
+                - isInstance of `bert` configuration class: TFBertModel (Bert model)
+                - isInstance of `openai-gpt` configuration class: TFOpenAIGPTModel (OpenAI GPT model)
+                - isInstance of `gpt2` configuration class: TFGPT2Model (OpenAI GPT-2 model)
+                - isInstance of `ctrl` configuration class: TFCTRLModel (Salesforce CTRL model)
+                - isInstance of `transfo-xl` configuration class: TFTransfoXLModel (Transformer-XL model)
+                - isInstance of `xlnet` configuration class: TFXLNetModel (XLNet model)
+                - isInstance of `xlm` configuration class: TFXLMModel (XLM model)
+
+        Examples::
+
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            model = TFAutoModel.from_config(config)  # E.g.
model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, DistilBertConfig): + return TFDistilBertModel(config) + elif isinstance(config, RobertaConfig): + return TFRobertaModel(config) + elif isinstance(config, BertConfig): + return TFBertModel(config) + elif isinstance(config, OpenAIGPTConfig): + return TFOpenAIGPTModel(config) + elif isinstance(config, GPT2Config): + return TFGPT2Model(config) + elif isinstance(config, TransfoXLConfig): + return TFTransfoXLModel(config) + elif isinstance(config, XLNetConfig): + return TFXLNetModel(config) + elif isinstance(config, XLMConfig): + return TFXLMModel(config) + elif isinstance(config, CTRLConfig): + return TFCTRLModel(config) + raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -156,7 +203,7 @@ class TFAutoModel(object): return TFCTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) @@ -186,7 +233,50 @@ class TFAutoModelWithLMHead(object): """ def __init__(self): raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated " - "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelWithLMHead.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) + - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `openai-gpt` configuration class: OpenAIGPTModel (OpenAI GPT model) + - isInstance of `gpt2` configuration class: GPT2Model (OpenAI GPT-2 model) + - isInstance of `ctrl` configuration class: CTRLModel (Salesforce CTRL model) + - isInstance of `transfo-xl` configuration class: TransfoXLModel (Transformer-XL model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `xlm` configuration class: XLMModel (XLM model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = AutoModelWithLMHead.from_config(config) # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, DistilBertConfig): + return TFDistilBertForMaskedLM(config) + elif isinstance(config, RobertaConfig): + return TFRobertaForMaskedLM(config) + elif isinstance(config, BertConfig): + return TFBertForMaskedLM(config) + elif isinstance(config, OpenAIGPTConfig): + return TFOpenAIGPTLMHeadModel(config) + elif isinstance(config, GPT2Config): + return TFGPT2LMHeadModel(config) + elif isinstance(config, TransfoXLConfig): + return TFTransfoXLLMHeadModel(config) + elif isinstance(config, XLNetConfig): + return TFXLNetLMHeadModel(config) + elif isinstance(config, XLMConfig): + return TFXLMWithLMHeadModel(config) + elif isinstance(config, CTRLConfig): + return TFCTRLLMHeadModel(config) + raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -287,7 +377,7 @@ class TFAutoModelWithLMHead(object): return TFCTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " + "'distilbert', 'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " "'xlm', 'roberta', 'ctrl'".format(pretrained_model_name_or_path)) @@ -312,8 +402,39 @@ class TFAutoModelForSequenceClassification(object): This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): - raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated " - "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + raise EnvironmentError("TFAutoModelForSequenceClassification is designed to be instantiated " + "using the `TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForSequenceClassification.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) + - isInstance of `roberta` configuration class: RobertaModel (RoBERTa model) + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `xlm` configuration class: XLMModel (XLM model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = AutoModelForSequenceClassification.from_config(config) # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, DistilBertConfig): + return TFDistilBertForSequenceClassification(config) + elif isinstance(config, RobertaConfig): + return TFRobertaForSequenceClassification(config) + elif isinstance(config, BertConfig): + return TFBertForSequenceClassification(config) + elif isinstance(config, XLNetConfig): + return TFXLNetForSequenceClassification(config) + elif isinstance(config, XLMConfig): + return TFXLMForSequenceClassification(config) + raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -405,7 +526,7 @@ class TFAutoModelForSequenceClassification(object): return TFXLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)) + "'distilbert', 'bert', 'xlnet', 'xlm', 'roberta'".format(pretrained_model_name_or_path)) class TFAutoModelForQuestionAnswering(object): @@ -428,8 +549,36 @@ class TFAutoModelForQuestionAnswering(object): This class cannot be instantiated using `__init__()` (throws an error). """ def __init__(self): - raise EnvironmentError("TFAutoModelWithLMHead is designed to be instantiated " - "using the `TFAutoModelWithLMHead.from_pretrained(pretrained_model_name_or_path)` method.") + raise EnvironmentError("TFAutoModelForQuestionAnswering is designed to be instantiated " + "using the `TFAutoModelForQuestionAnswering.from_pretrained(pretrained_model_name_or_path)` or " + "`TFAutoModelForQuestionAnswering.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `xlm` configuration class: XLMModel (XLM model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = AutoModelForSequenceClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, DistilBertConfig): + return TFDistilBertForQuestionAnswering(config) + elif isinstance(config, BertConfig): + return TFBertForQuestionAnswering(config) + elif isinstance(config, XLNetConfig): + return TFXLNetForQuestionAnswering(config) + elif isinstance(config, XLMConfig): + return TFXLMForQuestionAnswering(config) + raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): @@ -518,4 +667,4 @@ class TFAutoModelForQuestionAnswering(object): return TFXLMForQuestionAnsweringSimple.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " - "'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) + "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) diff --git a/transformers/pipeline.py b/transformers/pipeline.py index 15adc620b1..f2c55def92 100644 --- a/transformers/pipeline.py +++ b/transformers/pipeline.py @@ -58,7 +58,7 @@ class TextClassificationPipeline(object): - contains `xlnet`: XLNetModel (XLNet model) - contains `xlm`: XLMModel (XLM model) """ - def __init__(self, tokenizer, model): + def __init__(self, tokenizer, model, is_compiled=False, is_trained=False): self.tokenizer = tokenizer self.model = model if is_tf_available(): @@ -67,78 +67,13 @@ class TextClassificationPipeline(object): self.framework = 'pt' else: raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") - self.is_compiled = False + self.is_compiled = is_compiled + self.is_trained = is_trained @classmethod def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiates one of the base model classes of the library - from a pre-trained model configuration. - - The model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertModel (DistilBERT model) - - contains `roberta`: RobertaModel (RoBERTa model) - - contains `bert`: BertModel (Bert model) - - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) - - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) - - contains `ctrl`: CTRLModel (Salesforce CTRL model) - - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) - - contains `xlnet`: XLNetModel (XLNet model) - - contains `xlm`: XLMModel (XLM model) - - The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) - To train the model, you should first set it back in training mode with `model.train()` - - Params: - pretrained_model_name_or_path: either: - - - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. - - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: - Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - - - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or - - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. - - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. - - state_dict: (`optional`) dict: - an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. 
- This option can be used if you want to create a model from a pretrained configuration but load your own weights. - In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. - - cache_dir: (`optional`) string: - Path to a directory in which a downloaded pre-trained model - configuration should be cached if the standard cache should not be used. - - force_download: (`optional`) boolean, default False: - Force to (re-)download the model weights and configuration files and override the cached versions if they exists. - - proxies: (`optional`) dict, default None: - A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. - The proxies are used on each request. - - output_loading_info: (`optional`) boolean: - Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. - - kwargs: (`optional`) Remaining dictionary of keyword arguments: - Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: - - - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) - - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. - - Examples:: - - model = AutoModel.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. - model = AutoModel.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` - model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading - assert model.config.output_attention == True - # Loading from a TF checkpoint file instead of a PyTorch model (slower) - config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') - model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) - + r""" Instantiates a pipeline from a pre-trained tokenizer and model. 
""" # Extract tokenizer and model arguments tokenizer_kwargs = {} @@ -159,9 +94,11 @@ class TextClassificationPipeline(object): # used for both the tokenizer and the model model_kwargs[key] = kwargs[key] + model_kwargs['output_loading_info'] = True tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **tokenizer_kwargs) - model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) - return cls(tokenizer, model) + model, loading_info = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) + + return cls(tokenizer, model, is_trained=bool(not loading_info['missing_keys'])) def save_pretrained(self, save_directory): @@ -240,9 +177,13 @@ class TextClassificationPipeline(object): validation_data=valid_dataset, validation_steps=valid_steps, **kwargs) else: raise NotImplementedError + self.is_trained = True def __call__(self, text): + if not self.is_trained: + logger.error("Some weights of the model are not trained. Please fine-tune the model on a classification task before using it.") + inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, return_tensors=self.framework) if self.framework == 'tf': # TODO trace model From 7c1697562a38200e0e1a651b014ff0fc07343dd1 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 17 Oct 2019 13:17:05 +0200 Subject: [PATCH 065/302] compatibility with sklearn and keras --- transformers/commands/train.py | 2 +- transformers/data/processors/utils.py | 2 +- transformers/pipeline.py | 87 ++++++++++++++++++--------- 3 files changed, 59 insertions(+), 32 deletions(-) diff --git a/transformers/commands/train.py b/transformers/commands/train.py index fc89d48594..878ad21037 100644 --- a/transformers/commands/train.py +++ b/transformers/commands/train.py @@ -83,7 +83,7 @@ class TrainCommand(BaseTransformersCLICommand): self.logger.info('Loading model {}'.format(args.model_name)) self.model_name = args.model_name - self.tokenizer = AutoTokenizer.from_pretrained(args.model_name) + self.pipeline = AutoTokenizer.from_pretrained(args.model_name) if args.task == 'text_classification': self.model = SequenceClassifModel.from_pretrained(args.model_name) elif args.task == 'token_classification': diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index 75bed86042..61b139c02b 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -222,7 +222,7 @@ class SingleSentenceClassificationProcessor(DataProcessor): batch_length = max(len(input_ids) for input_ids in all_input_ids) features = [] - for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, examples)): + for (ex_index, (input_ids, example)) in enumerate(zip(all_input_ids, self.examples)): if ex_index % 10000 == 0: logger.info("Writing example %d", ex_index) # The mask has 1 for real tokens and 0 for padding tokens. 
Only real diff --git a/transformers/pipeline.py b/transformers/pipeline.py index f2c55def92..dc7bcaeac3 100644 --- a/transformers/pipeline.py +++ b/transformers/pipeline.py @@ -109,7 +109,32 @@ class TextClassificationPipeline(object): self.tokenizer.save_pretrained(save_directory) - def compile(self, learning_rate=3e-5, epsilon=1e-8): + def prepare_data(self, train_samples_text, train_samples_labels, + valid_samples_text=None, valid_samples_labels=None, + validation_split=0.1, **kwargs): + dataset = SingleSentenceClassificationProcessor.create_from_examples(train_samples_text, + train_samples_labels) + num_data_samples = len(dataset) + if valid_samples_text is not None and valid_samples_labels is not None: + valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(valid_samples_text, + valid_samples_labels) + num_valid_samples = len(valid_dataset) + train_dataset = dataset + num_train_samples = num_data_samples + else: + assert 0.0 <= validation_split <= 1.0, "validation_split should be between 0.0 and 1.0" + num_valid_samples = int(num_data_samples * validation_split) + num_train_samples = num_data_samples - num_valid_samples + train_dataset = dataset[num_train_samples] + valid_dataset = dataset[num_valid_samples] + + logger.info('Tokenizing and processing dataset') + train_dataset = train_dataset.get_features(self.tokenizer, return_tensors=self.framework) + valid_dataset = valid_dataset.get_features(self.tokenizer, return_tensors=self.framework) + return train_dataset, valid_dataset, num_train_samples, num_valid_samples + + + def compile(self, learning_rate=3e-5, epsilon=1e-8, **kwargs): if self.framework == 'tf': logger.info('Preparing model') # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule @@ -125,39 +150,20 @@ class TextClassificationPipeline(object): self.is_compiled = True - def prepare_data(self, train_samples_text, train_samples_labels, - valid_samples_text=None, valid_samples_labels=None, - validation_split=0.1): - dataset = SingleSentenceClassificationProcessor.create_from_examples(train_samples_text, - train_samples_labels) - num_data_samples = len(dataset) - if valid_samples_text is not None and valid_samples_labels is not None: - valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(valid_samples_text, - valid_samples_labels) - num_valid_samples = len(valid_dataset) - train_dataset = dataset - num_train_samples = num_data_samples - else: - assert 0.0 < validation_split < 1.0, "validation_split should be between 0.0 and 1.0" - num_valid_samples = int(num_data_samples * validation_split) - num_train_samples = num_data_samples - num_valid_samples - train_dataset = dataset[num_train_samples] - valid_dataset = dataset[num_valid_samples] - - logger.info('Tokenizing and processing dataset') - train_dataset = train_dataset.get_features(self.tokenizer, return_tensors=self.framework) - valid_dataset = valid_dataset.get_features(self.tokenizer, return_tensors=self.framework) - return train_dataset, valid_dataset, num_train_samples, num_valid_samples - - - def fit(self, train_samples_text, train_samples_labels, + def fit(self, train_samples_text=None, train_samples_labels=None, valid_samples_text=None, valid_samples_labels=None, train_batch_size=None, valid_batch_size=None, validation_split=0.1, **kwargs): + # Generic compatibility with sklearn and Keras + if 'y' in kwargs and train_samples_labels is None: + train_samples_labels = kwargs.pop('y') + if 'X' in kwargs and train_samples_text is None: + 
train_samples_text = kwargs.pop('X') + if not self.is_compiled: - self.compile() + self.compile(**kwargs) datasets = self.prepare_data(train_samples_text, train_samples_labels, valid_samples_text, valid_samples_labels, @@ -180,11 +186,32 @@ class TextClassificationPipeline(object): self.is_trained = True - def __call__(self, text): + def fit_transform(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + self.fit(*texts, **kwargs) + return self(*texts, **kwargs) + + + def transform(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + return self(*texts, **kwargs) + + + def predict(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + return self(*texts, **kwargs) + + + def __call__(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + if 'X' in kwargs and not texts: + texts = kwargs.pop('X') + if not self.is_trained: logger.error("Some weights of the model are not trained. Please fine-tune the model on a classification task before using it.") - inputs = self.tokenizer.encode_plus(text, add_special_tokens=True, return_tensors=self.framework) + inputs = self.tokenizer.batch_encode_plus(texts, add_special_tokens=True, return_tensors=self.framework) + if self.framework == 'tf': # TODO trace model predictions = self.model(**inputs)[0] From 31a3a73ee39d7de28e69e62d8c1a0988a765a0e0 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 17 Oct 2019 15:10:11 +0200 Subject: [PATCH 066/302] updating CLI --- transformers-cli | 4 ++ transformers/__init__.py | 4 +- transformers/commands/train.py | 89 ++++++++++--------------- transformers/data/processors/utils.py | 48 +++++++++----- transformers/modeling_tf_utils.py | 31 +++++++++ transformers/pipeline.py | 95 ++++++++++++++------------- 6 files changed, 155 insertions(+), 116 deletions(-) diff --git a/transformers-cli b/transformers-cli index 7b0905d4b4..397b382308 100644 --- a/transformers-cli +++ b/transformers-cli @@ -3,6 +3,8 @@ from argparse import ArgumentParser from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands +from transformers.commands.train import TrainCommand +from transformers.commands.convert import ConvertCommand if __name__ == '__main__': parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli []') @@ -11,6 +13,8 @@ if __name__ == '__main__': # Register commands ServeCommand.register_subcommand(commands_parser) UserCommands.register_subcommand(commands_parser) + TrainCommand.register_subcommand(commands_parser) + ConvertCommand.register_subcommand(commands_parser) # Let's go args = parser.parse_args() diff --git a/transformers/__init__.py b/transformers/__init__.py index a71a291a44..26036d2e8d 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -25,7 +25,6 @@ from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH from .data import (is_sklearn_available, InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor, - convert_examples_to_features, glue_output_modes, glue_convert_examples_to_features, glue_processors, glue_tasks_num_labels, xnli_output_modes, xnli_processors, xnli_tasks_num_labels, @@ -66,6 +65,9 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CO from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +# Pipelines +from .pipeline import 
TextClassificationPipeline + # Modeling if is_torch_available(): from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) diff --git a/transformers/commands/train.py b/transformers/commands/train.py index 878ad21037..7b26745881 100644 --- a/transformers/commands/train.py +++ b/transformers/commands/train.py @@ -3,14 +3,11 @@ from argparse import ArgumentParser, Namespace from logging import getLogger from transformers.commands import BaseTransformersCLICommand -from transformers import (AutoTokenizer, is_tf_available, is_torch_available, - SingleSentenceClassificationProcessor, - convert_examples_to_features) -if is_tf_available(): - from transformers import TFAutoModelForSequenceClassification as SequenceClassifModel -elif is_torch_available(): - from transformers import AutoModelForSequenceClassification as SequenceClassifModel -else: +from transformers import (is_tf_available, is_torch_available, + TextClassificationPipeline, + SingleSentenceClassificationProcessor as Processor) + +if not is_tf_available() and not is_torch_available(): raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") # TF training parameters @@ -35,16 +32,18 @@ class TrainCommand(BaseTransformersCLICommand): :return: """ train_parser = parser.add_parser('train', help='CLI tool to train a model on a task.') + train_parser.add_argument('--train_data', type=str, required=True, help="path to train (and optionally evaluation) dataset as a csv with " "tab separated labels and sentences.") - train_parser.add_argument('--column_label', type=int, default=0, help='Column of the dataset csv file with example labels.') train_parser.add_argument('--column_text', type=int, default=1, help='Column of the dataset csv file with example texts.') train_parser.add_argument('--column_id', type=int, default=2, help='Column of the dataset csv file with example ids.') + train_parser.add_argument('--skip_first_row', action='store_true', + help='Skip the first row of the csv file (headers).') train_parser.add_argument('--validation_data', type=str, default='', help='path to validation dataset.') @@ -74,39 +73,38 @@ class TrainCommand(BaseTransformersCLICommand): self.framework = 'tf' if is_tf_available() else 'torch' - os.makedirs(args.output) + os.makedirs(args.output, exist_ok=True) + assert os.path.isdir(args.output) self.output = args.output self.column_label = args.column_label self.column_text = args.column_text self.column_id = args.column_id - self.logger.info('Loading model {}'.format(args.model_name)) - self.model_name = args.model_name - self.pipeline = AutoTokenizer.from_pretrained(args.model_name) + self.logger.info('Loading {} pipeline for {}'.format(args.task, args.model)) if args.task == 'text_classification': - self.model = SequenceClassifModel.from_pretrained(args.model_name) + self.pipeline = TextClassificationPipeline.from_pretrained(args.model) elif args.task == 'token_classification': raise NotImplementedError elif args.task == 'question_answering': raise NotImplementedError self.logger.info('Loading dataset from {}'.format(args.train_data)) - dataset = SingleSentenceClassificationProcessor.create_from_csv(args.train_data) - num_data_samples = len(dataset) + self.train_dataset = Processor.create_from_csv(args.train_data, + column_label=args.column_label, + column_text=args.column_text, + column_id=args.column_id, + skip_first_row=args.skip_first_row) + self.valid_dataset = None if args.validation_data: self.logger.info('Loading validation dataset from 
{}'.format(args.validation_data)) - self.valid_dataset = SingleSentenceClassificationProcessor.create_from_csv(args.validation_data) - self.num_valid_samples = len(self.valid_dataset) - self.train_dataset = dataset - self.num_train_samples = num_data_samples - else: - assert 0.0 < args.validation_split < 1.0, "--validation_split should be between 0.0 and 1.0" - self.num_valid_samples = num_data_samples * args.validation_split - self.num_train_samples = num_data_samples - self.num_valid_samples - self.train_dataset = dataset[self.num_train_samples] - self.valid_dataset = dataset[self.num_valid_samples] + self.valid_dataset = Processor.create_from_csv(args.validation_data, + column_label=args.column_label, + column_text=args.column_text, + column_id=args.column_id, + skip_first_row=args.skip_first_row) + self.validation_split = args.validation_split self.train_batch_size = args.train_batch_size self.valid_batch_size = args.valid_batch_size self.learning_rate = args.learning_rate @@ -121,34 +119,13 @@ class TrainCommand(BaseTransformersCLICommand): raise NotImplementedError def run_tf(self): - import tensorflow as tf + self.pipeline.fit(self.train_dataset, + validation_data=self.valid_dataset, + validation_split=self.validation_split, + learning_rate=self.learning_rate, + adam_epsilon=self.adam_epsilon, + train_batch_size=self.train_batch_size, + valid_batch_size=self.valid_batch_size) - tf.config.optimizer.set_jit(USE_XLA) - tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP}) - - # Prepare dataset as a tf.train_data.Dataset instance - self.logger.info('Tokenizing and processing dataset') - train_dataset = self.train_dataset.get_features(self.tokenizer) - valid_dataset = self.valid_dataset.get_features(self.tokenizer) - train_dataset = train_dataset.shuffle(128).batch(self.train_batch_size).repeat(-1) - valid_dataset = valid_dataset.batch(self.valid_batch_size) - - # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule - opt = tf.keras.optimizers.Adam(learning_rate=args.learning_rate, epsilon=self.adam_epsilon) - if USE_AMP: - # loss scaling is currently required when using mixed precision - opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') - self.model.compile(optimizer=opt, loss=loss, metrics=[metric]) - - # Train and evaluate using tf.keras.Model.fit() - train_steps = self.num_train_samples//self.train_batch_size - valid_steps = self.num_valid_samples//self.valid_batch_size - - self.logger.info('Training model') - history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, - validation_data=valid_dataset, validation_steps=valid_steps) - - # Save trained model - self.model.save_pretrained(self.output) + # Save trained pipeline + self.pipeline.save_pretrained(self.output) diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py index 61b139c02b..ee234e6e90 100644 --- a/transformers/data/processors/utils.py +++ b/transformers/data/processors/utils.py @@ -122,14 +122,30 @@ class SingleSentenceClassificationProcessor(DataProcessor): return self.examples[idx] @classmethod - def create_from_csv(cls, file_name, **kwargs): + def create_from_csv(cls, file_name, split_name='', column_label=0, column_text=1, + column_id=None, skip_first_row=False, **kwargs): processor = cls(**kwargs) - 
processor.add_examples_from_csv(file_name) + processor.add_examples_from_csv(file_name, + split_name=split_name, + column_label=column_label, + column_text=column_text, + column_id=column_id, + skip_first_row=skip_first_row, + overwrite_labels=True, + overwrite_examples=True) + return processor + + @classmethod + def create_from_examples(cls, texts_or_text_and_labels, labels=None, **kwargs): + processor = cls(**kwargs) + processor.add_examples(texts_or_text_and_labels, labels=labels) return processor def add_examples_from_csv(self, file_name, split_name='', column_label=0, column_text=1, column_id=None, - overwrite_labels=False, overwrite_examples=False): + skip_first_row=False, overwrite_labels=False, overwrite_examples=False): lines = self._read_tsv(file_name) + if skip_first_row: + lines = lines[1:] texts = [] labels = [] ids = [] @@ -144,15 +160,21 @@ class SingleSentenceClassificationProcessor(DataProcessor): return self.add_examples(texts, labels, ids, overwrite_labels=overwrite_labels, overwrite_examples=overwrite_examples) - def add_examples(self, texts, labels, ids=None, overwrite_labels=False, overwrite_examples=False): + def add_examples(self, texts_or_text_and_labels, labels=None, ids=None, + overwrite_labels=False, overwrite_examples=False): + assert labels is None or len(texts_or_text_and_labels) == len(labels) + assert ids is None or len(texts_or_text_and_labels) == len(ids) if ids is None: - ids = [None] * len(texts) - assert len(texts) == len(labels) - assert len(texts) == len(ids) - + ids = [None] * len(texts_or_text_and_labels) + if labels is None: + labels = [None] * len(texts_or_text_and_labels) examples = [] added_labels = set() - for (text, label, guid) in zip(texts, labels, ids): + for (text_or_text_and_label, label, guid) in zip(texts_or_text_and_labels, labels, ids): + if isinstance(text_or_text_and_label, (tuple, list)) and label is None: + text, label = text_or_text_and_label + else: + text = text_or_text_and_label added_labels.add(label) examples.append(InputExample(guid=guid, text_a=text, text_b=None, label=label)) @@ -170,12 +192,6 @@ class SingleSentenceClassificationProcessor(DataProcessor): return self.examples - @classmethod - def create_from_examples(cls, texts, labels, **kwargs): - processor = cls(**kwargs) - processor.add_examples(texts, labels) - return processor - def get_features(self, tokenizer, max_length=None, @@ -204,6 +220,8 @@ class SingleSentenceClassificationProcessor(DataProcessor): a list of task-specific ``InputFeatures`` which can be fed to the model. """ + if max_length is None: + max_length = tokenizer.max_len label_map = {label: i for i, label in enumerate(self.labels)} diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index ed8fdb74c9..6c48f3eed2 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -22,6 +22,8 @@ import logging import os import tensorflow as tf +from tensorflow.python.keras.saving import hdf5_format +import h5py from .configuration_utils import PretrainedConfig from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME @@ -206,6 +208,9 @@ class TFPreTrainedModel(tf.keras.Model): A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. 
+ kwargs: (`optional`) Remaining dictionary of keyword arguments: Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: @@ -229,6 +234,7 @@ class TFPreTrainedModel(tf.keras.Model): force_download = kwargs.pop('force_download', False) resume_download = kwargs.pop('resume_download', False) proxies = kwargs.pop('proxies', None) + output_loading_info = kwargs.pop('output_loading_info', False) # Load config if config is None: @@ -304,6 +310,31 @@ class TFPreTrainedModel(tf.keras.Model): ret = model(model.dummy_inputs, training=False) # Make sure restore ops are run + # Check if the models are the same to output loading informations + with h5py.File(resolved_archive_file, 'r') as f: + if 'layer_names' not in f.attrs and 'model_weights' in f: + f = f['model_weights'] + hdf5_layer_names = set(hdf5_format.load_attributes_from_hdf5_group(f, 'layer_names')) + model_layer_names = set(layer.name for layer in model.layers) + missing_keys = list(model_layer_names - hdf5_layer_names) + unexpected_keys = list(hdf5_layer_names - model_layer_names) + error_msgs = [] + + if len(missing_keys) > 0: + logger.info("Layers of {} not initialized from pretrained model: {}".format( + model.__class__.__name__, missing_keys)) + if len(unexpected_keys) > 0: + logger.info("Layers from pretrained model not used in {}: {}".format( + model.__class__.__name__, unexpected_keys)) + if len(error_msgs) > 0: + raise RuntimeError('Error(s) in loading weights for {}:\n\t{}'.format( + model.__class__.__name__, "\n\t".join(error_msgs))) + if output_loading_info: + loading_info = {"missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "error_msgs": error_msgs} + return model, loading_info + return model class TFConv1D(tf.keras.layers.Layer): diff --git a/transformers/pipeline.py b/transformers/pipeline.py index dc7bcaeac3..6e55ca4d7e 100644 --- a/transformers/pipeline.py +++ b/transformers/pipeline.py @@ -17,18 +17,22 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os import logging +import six -from .modeling_auto import (AutoModel, AutoModelForQuestionAnswering, - AutoModelForSequenceClassification, - AutoModelWithLMHead) from .tokenization_auto import AutoTokenizer from .file_utils import add_start_docstrings, is_tf_available, is_torch_available from .data.processors import SingleSentenceClassificationProcessor if is_tf_available(): import tensorflow as tf + from .modeling_tf_auto import (TFAutoModel, TFAutoModelForQuestionAnswering, + TFAutoModelForSequenceClassification, + TFAutoModelWithLMHead) if is_torch_available(): import torch + from .modeling_auto import (AutoModel, AutoModelForQuestionAnswering, + AutoModelForSequenceClassification, + AutoModelWithLMHead) logger = logging.getLogger(__name__) @@ -61,12 +65,6 @@ class TextClassificationPipeline(object): def __init__(self, tokenizer, model, is_compiled=False, is_trained=False): self.tokenizer = tokenizer self.model = model - if is_tf_available(): - self.framework = 'tf' - elif is_torch_available(): - self.framework = 'pt' - else: - raise ImportError("At least one of PyTorch or TensorFlow 2.0+ should be installed to use CLI training") self.is_compiled = is_compiled self.is_trained = is_trained @@ -94,9 +92,12 @@ class TextClassificationPipeline(object): # used for both the tokenizer and the model model_kwargs[key] = kwargs[key] - 
model_kwargs['output_loading_info'] = True tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **tokenizer_kwargs) - model, loading_info = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) + model_kwargs['output_loading_info'] = True + if is_tf_available(): + model, loading_info = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) + else: + model, loading_info = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) return cls(tokenizer, model, is_trained=bool(not loading_info['missing_keys'])) @@ -109,36 +110,42 @@ class TextClassificationPipeline(object): self.tokenizer.save_pretrained(save_directory) - def prepare_data(self, train_samples_text, train_samples_labels, - valid_samples_text=None, valid_samples_labels=None, + def prepare_data(self, x, y=None, + validation_data=None, validation_split=0.1, **kwargs): - dataset = SingleSentenceClassificationProcessor.create_from_examples(train_samples_text, - train_samples_labels) + dataset = x + if not isinstance(x, SingleSentenceClassificationProcessor): + dataset = SingleSentenceClassificationProcessor.create_from_examples(x, y) num_data_samples = len(dataset) - if valid_samples_text is not None and valid_samples_labels is not None: - valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(valid_samples_text, - valid_samples_labels) + + if validation_data is not None: + valid_dataset = validation_data + if not isinstance(validation_data, SingleSentenceClassificationProcessor): + valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(validation_data) + num_valid_samples = len(valid_dataset) train_dataset = dataset num_train_samples = num_data_samples else: assert 0.0 <= validation_split <= 1.0, "validation_split should be between 0.0 and 1.0" - num_valid_samples = int(num_data_samples * validation_split) + num_valid_samples = max(int(num_data_samples * validation_split), 1) num_train_samples = num_data_samples - num_valid_samples - train_dataset = dataset[num_train_samples] - valid_dataset = dataset[num_valid_samples] + train_dataset = dataset[num_valid_samples:] + valid_dataset = dataset[:num_valid_samples] logger.info('Tokenizing and processing dataset') - train_dataset = train_dataset.get_features(self.tokenizer, return_tensors=self.framework) - valid_dataset = valid_dataset.get_features(self.tokenizer, return_tensors=self.framework) - return train_dataset, valid_dataset, num_train_samples, num_valid_samples + train_dataset = train_dataset.get_features(self.tokenizer, + return_tensors='tf' if is_tf_available() else 'pt') + valid_dataset = valid_dataset.get_features(self.tokenizer, + return_tensors='tf' if is_tf_available() else 'pt') + return train_dataset, valid_dataset - def compile(self, learning_rate=3e-5, epsilon=1e-8, **kwargs): - if self.framework == 'tf': + def compile(self, learning_rate=3e-5, adam_epsilon=1e-8, **kwargs): + if is_tf_available(): logger.info('Preparing model') # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule - opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=epsilon) + opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=adam_epsilon) if USE_AMP: # loss scaling is currently required when using mixed precision opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') @@ -150,39 +157,37 @@ class 
TextClassificationPipeline(object): self.is_compiled = True - def fit(self, train_samples_text=None, train_samples_labels=None, - valid_samples_text=None, valid_samples_labels=None, - train_batch_size=None, valid_batch_size=None, + def fit(self, X=None, y=None, + validation_data=None, validation_split=0.1, + train_batch_size=None, + valid_batch_size=None, **kwargs): - # Generic compatibility with sklearn and Keras - if 'y' in kwargs and train_samples_labels is None: - train_samples_labels = kwargs.pop('y') - if 'X' in kwargs and train_samples_text is None: - train_samples_text = kwargs.pop('X') - if not self.is_compiled: self.compile(**kwargs) - datasets = self.prepare_data(train_samples_text, train_samples_labels, - valid_samples_text, valid_samples_labels, - validation_split) - train_dataset, valid_dataset, num_train_samples, num_valid_samples = datasets + train_dataset, valid_dataset = self.prepare_data(X, y=y, + validation_data=validation_data, + validation_split=validation_split) + num_train_samples = len(train_dataset) + num_valid_samples = len(valid_dataset) train_steps = num_train_samples//train_batch_size valid_steps = num_valid_samples//valid_batch_size - if self.framework == 'tf': + if is_tf_available(): # Prepare dataset as a tf.train_data.Dataset instance train_dataset = train_dataset.shuffle(128).batch(train_batch_size).repeat(-1) valid_dataset = valid_dataset.batch(valid_batch_size) logger.info('Training TF 2.0 model') history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, - validation_data=valid_dataset, validation_steps=valid_steps, **kwargs) + validation_data=valid_dataset, validation_steps=valid_steps, + **kwargs) else: raise NotImplementedError + self.is_trained = True @@ -210,9 +215,11 @@ class TextClassificationPipeline(object): if not self.is_trained: logger.error("Some weights of the model are not trained. Please fine-tune the model on a classification task before using it.") - inputs = self.tokenizer.batch_encode_plus(texts, add_special_tokens=True, return_tensors=self.framework) + inputs = self.tokenizer.batch_encode_plus(texts, + add_special_tokens=True, + return_tensors='tf' if is_tf_available() else 'pt') - if self.framework == 'tf': + if is_tf_available(): # TODO trace model predictions = self.model(**inputs)[0] else: From 81babb227e6d6505be088ac452f3cda8a14c2255 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 3 Dec 2019 14:56:57 +0100 Subject: [PATCH 067/302] Added download command through the cli. It allows to predownload models and tokenizers. 
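As a rough usage sketch (the model name and cache path are illustrative, not part of this patch), the new subcommand can be invoked as

    transformers-cli download --cache-dir ~/transformers_cache --force bert-base-uncased

which, per the implementation below, simply calls AutoModel.from_pretrained and AutoTokenizer.from_pretrained with the requested cache_dir and force_download options.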
--- transformers-cli | 4 +++- transformers/commands/download.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) mode change 100644 => 100755 transformers-cli create mode 100644 transformers/commands/download.py diff --git a/transformers-cli b/transformers-cli old mode 100644 new mode 100755 index 397b382308..168e6e6f32 --- a/transformers-cli +++ b/transformers-cli @@ -1,6 +1,7 @@ #!/usr/bin/env python from argparse import ArgumentParser +from transformers.commands.download import DownloadCommand from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands from transformers.commands.train import TrainCommand @@ -11,10 +12,11 @@ if __name__ == '__main__': commands_parser = parser.add_subparsers(help='transformers-cli command helpers') # Register commands + ConvertCommand.register_subcommand(commands_parser) + DownloadCommand.register_subcommand(commands_parser) ServeCommand.register_subcommand(commands_parser) UserCommands.register_subcommand(commands_parser) TrainCommand.register_subcommand(commands_parser) - ConvertCommand.register_subcommand(commands_parser) # Let's go args = parser.parse_args() diff --git a/transformers/commands/download.py b/transformers/commands/download.py new file mode 100644 index 0000000000..0938f135d2 --- /dev/null +++ b/transformers/commands/download.py @@ -0,0 +1,29 @@ +from argparse import ArgumentParser + +from transformers.commands import BaseTransformersCLICommand + + +def download_command_factory(args): + return DownloadCommand(args.model, args.cache_dir, args.force) + + +class DownloadCommand(BaseTransformersCLICommand): + + @staticmethod + def register_subcommand(parser: ArgumentParser): + download_parser = parser.add_parser('download') + download_parser.add_argument('--cache-dir', type=str, default=None, help='Path to location to store the models') + download_parser.add_argument('--force', action='store_true', help='Force the model to be download even if already in cache-dir') + download_parser.add_argument('model', type=str, help='Name of the model to download') + download_parser.set_defaults(func=download_command_factory) + + def __init__(self, model: str, cache: str, force: bool): + self._model = model + self._cache = cache + self._force = force + + def run(self): + from transformers import AutoModel, AutoTokenizer + + AutoModel.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) + AutoTokenizer.from_pretrained(self._model, cache_dir=self._cache, force_download=self._force) \ No newline at end of file From e1d89cb24d13d158966c190bb75ece38eae26746 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 6 Dec 2019 00:52:04 +0100 Subject: [PATCH 068/302] Added QuestionAnsweringPipeline with batch support. 
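A minimal usage sketch of the pipeline() factory introduced below (both checkpoint names are illustrative, and the question-answering input format is still reworked in the following commits):

    from transformers import pipeline

    # Task names come from the SUPPORTED_TASKS registry added in this patch.
    clf = pipeline('text-classification', 'bert-base-uncased')
    qa = pipeline('question-answering', 'bert-large-uncased-whole-word-masking-finetuned-squad')

The returned objects expose transform() and predict() aliases around __call__ for sklearn/Keras-style use, and the question-answering pipeline returns dicts with 'answer', 'start' and 'end' keys.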
--- transformers/__init__.py | 7 +- transformers/pipeline.py | 229 -------------------------------------- transformers/pipelines.py | 222 ++++++++++++++++++++++++++++++++++++ 3 files changed, 226 insertions(+), 232 deletions(-) mode change 100644 => 100755 transformers/__init__.py delete mode 100644 transformers/pipeline.py create mode 100755 transformers/pipelines.py diff --git a/transformers/__init__.py b/transformers/__init__.py old mode 100644 new mode 100755 index 26036d2e8d..4300409257 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -65,9 +65,6 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CO from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -# Pipelines -from .pipeline import TextClassificationPipeline - # Modeling if is_torch_available(): from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) @@ -193,6 +190,10 @@ from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name load_tf2_weights_in_pytorch_model, load_tf2_model_in_pytorch_model) +# Pipelines +# from .pipeline_ import TextClassificationPipeline +from .pipelines import Pipeline, pipeline, TextClassificationPipeline + if not is_tf_available() and not is_torch_available(): logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found." "Models won't be available and only tokenizers, configuration" diff --git a/transformers/pipeline.py b/transformers/pipeline.py deleted file mode 100644 index 6e55ca4d7e..0000000000 --- a/transformers/pipeline.py +++ /dev/null @@ -1,229 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Pipeline class: Tokenizer + Model. """ - -from __future__ import absolute_import, division, print_function, unicode_literals -import os -import logging -import six - -from .tokenization_auto import AutoTokenizer -from .file_utils import add_start_docstrings, is_tf_available, is_torch_available -from .data.processors import SingleSentenceClassificationProcessor - -if is_tf_available(): - import tensorflow as tf - from .modeling_tf_auto import (TFAutoModel, TFAutoModelForQuestionAnswering, - TFAutoModelForSequenceClassification, - TFAutoModelWithLMHead) -if is_torch_available(): - import torch - from .modeling_auto import (AutoModel, AutoModelForQuestionAnswering, - AutoModelForSequenceClassification, - AutoModelWithLMHead) - -logger = logging.getLogger(__name__) - -# TF training parameters -USE_XLA = False -USE_AMP = False - -class TextClassificationPipeline(object): - r""" - :class:`~transformers.TextClassificationPipeline` is a class encapsulating a pretrained model and - its tokenizer and will be instantiated as one of the base model classes of the library - when created with the `Pipeline.from_pretrained(pretrained_model_name_or_path)` - class method. 
- - The `from_pretrained()` method takes care of returning the correct model class instance - using pattern matching on the `pretrained_model_name_or_path` string. - - The base model class to instantiate is selected as the first pattern matching - in the `pretrained_model_name_or_path` string (in the following order): - - contains `distilbert`: DistilBertModel (DistilBERT model) - - contains `roberta`: RobertaModel (RoBERTa model) - - contains `bert`: BertModel (Bert model) - - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) - - contains `gpt2`: GPT2Model (OpenAI GPT-2 model) - - contains `ctrl`: CTRLModel (Salesforce CTRL model) - - contains `transfo-xl`: TransfoXLModel (Transformer-XL model) - - contains `xlnet`: XLNetModel (XLNet model) - - contains `xlm`: XLMModel (XLM model) - """ - def __init__(self, tokenizer, model, is_compiled=False, is_trained=False): - self.tokenizer = tokenizer - self.model = model - self.is_compiled = is_compiled - self.is_trained = is_trained - - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): - r""" Instantiates a pipeline from a pre-trained tokenizer and model. - """ - # Extract tokenizer and model arguments - tokenizer_kwargs = {} - for key in kwargs: - if key.startswith('tokenizer_'): - # Specific to the tokenizer - tokenizer_kwargs[key.replace('tokenizer_', '')] = kwargs.pop(key) - elif not key.startswith('model_'): - # used for both the tokenizer and the model - tokenizer_kwargs[key] = kwargs[key] - - model_kwargs = {} - for key in kwargs: - if key.startswith('model_'): - # Specific to the model - model_kwargs[key.replace('model_', '')] = kwargs.pop(key) - elif not key.startswith('tokenizer_'): - # used for both the tokenizer and the model - model_kwargs[key] = kwargs[key] - - tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, **tokenizer_kwargs) - model_kwargs['output_loading_info'] = True - if is_tf_available(): - model, loading_info = TFAutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) - else: - model, loading_info = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path, **model_kwargs) - - return cls(tokenizer, model, is_trained=bool(not loading_info['missing_keys'])) - - - def save_pretrained(self, save_directory): - if not os.path.isdir(save_directory): - logger.error("Saving directory ({}) should be a directory".format(save_directory)) - return - self.model.save_pretrained(save_directory) - self.tokenizer.save_pretrained(save_directory) - - - def prepare_data(self, x, y=None, - validation_data=None, - validation_split=0.1, **kwargs): - dataset = x - if not isinstance(x, SingleSentenceClassificationProcessor): - dataset = SingleSentenceClassificationProcessor.create_from_examples(x, y) - num_data_samples = len(dataset) - - if validation_data is not None: - valid_dataset = validation_data - if not isinstance(validation_data, SingleSentenceClassificationProcessor): - valid_dataset = SingleSentenceClassificationProcessor.create_from_examples(validation_data) - - num_valid_samples = len(valid_dataset) - train_dataset = dataset - num_train_samples = num_data_samples - else: - assert 0.0 <= validation_split <= 1.0, "validation_split should be between 0.0 and 1.0" - num_valid_samples = max(int(num_data_samples * validation_split), 1) - num_train_samples = num_data_samples - num_valid_samples - train_dataset = dataset[num_valid_samples:] - valid_dataset = dataset[:num_valid_samples] - - logger.info('Tokenizing and 
processing dataset') - train_dataset = train_dataset.get_features(self.tokenizer, - return_tensors='tf' if is_tf_available() else 'pt') - valid_dataset = valid_dataset.get_features(self.tokenizer, - return_tensors='tf' if is_tf_available() else 'pt') - return train_dataset, valid_dataset - - - def compile(self, learning_rate=3e-5, adam_epsilon=1e-8, **kwargs): - if is_tf_available(): - logger.info('Preparing model') - # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule - opt = tf.keras.optimizers.Adam(learning_rate=learning_rate, epsilon=adam_epsilon) - if USE_AMP: - # loss scaling is currently required when using mixed precision - opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic') - loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True) - metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy') - self.model.compile(optimizer=opt, loss=loss, metrics=[metric]) - else: - raise NotImplementedError - self.is_compiled = True - - - def fit(self, X=None, y=None, - validation_data=None, - validation_split=0.1, - train_batch_size=None, - valid_batch_size=None, - **kwargs): - - if not self.is_compiled: - self.compile(**kwargs) - - train_dataset, valid_dataset = self.prepare_data(X, y=y, - validation_data=validation_data, - validation_split=validation_split) - num_train_samples = len(train_dataset) - num_valid_samples = len(valid_dataset) - - train_steps = num_train_samples//train_batch_size - valid_steps = num_valid_samples//valid_batch_size - - if is_tf_available(): - # Prepare dataset as a tf.train_data.Dataset instance - train_dataset = train_dataset.shuffle(128).batch(train_batch_size).repeat(-1) - valid_dataset = valid_dataset.batch(valid_batch_size) - - logger.info('Training TF 2.0 model') - history = self.model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps, - validation_data=valid_dataset, validation_steps=valid_steps, - **kwargs) - else: - raise NotImplementedError - - self.is_trained = True - - - def fit_transform(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - self.fit(*texts, **kwargs) - return self(*texts, **kwargs) - - - def transform(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - return self(*texts, **kwargs) - - - def predict(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - return self(*texts, **kwargs) - - - def __call__(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - if 'X' in kwargs and not texts: - texts = kwargs.pop('X') - - if not self.is_trained: - logger.error("Some weights of the model are not trained. Please fine-tune the model on a classification task before using it.") - - inputs = self.tokenizer.batch_encode_plus(texts, - add_special_tokens=True, - return_tensors='tf' if is_tf_available() else 'pt') - - if is_tf_available(): - # TODO trace model - predictions = self.model(**inputs)[0] - else: - with torch.no_grad(): - predictions = self.model(**inputs)[0] - - return predictions.numpy().tolist() diff --git a/transformers/pipelines.py b/transformers/pipelines.py new file mode 100755 index 0000000000..e85f2300e3 --- /dev/null +++ b/transformers/pipelines.py @@ -0,0 +1,222 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +from abc import ABC, abstractmethod +from typing import Union, Optional, Tuple + +import numpy as np + +from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedTokenizer, is_torch_available + +if is_tf_available(): + from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering +else: + from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering + + +class Pipeline(ABC): + def __init__(self, model, tokenizer: PreTrainedTokenizer = None, **kwargs): + self.model = model + self.tokenizer = tokenizer + + @classmethod + @abstractmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + raise NotImplementedError() + + def save_pretrained(self, save_directory): + if not os.path.isdir(save_directory): + logger.error("Provided path ({}) should be a directory".format(save_directory)) + return + + self.model.save_pretrained(save_directory) + self.tokenizer.save_pretrained(save_directory) + + def transform(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + return self(*texts, **kwargs) + + def predict(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + return self(*texts, **kwargs) + + @abstractmethod + def __call__(self, *texts, **kwargs): + raise NotImplementedError() + + +class TextClassificationPipeline(Pipeline): + def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2): + super().__init__(model, tokenizer) + + if nb_classes < 2: + raise Exception('Invalid parameter nb_classes. 
int >= 2 is required (got: {})'.format(nb_classes)) + self._nb_classes = nb_classes + + @classmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + return cls(model, tokenizer, **kwargs) + + def __call__(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + if 'X' in kwargs and not texts: + texts = kwargs.pop('X') + + inputs = self.tokenizer.batch_encode_plus( + texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + ) + + special_tokens_mask = inputs.pop('special_tokens_mask') + + if is_tf_available(): + # TODO trace model + predictions = self.model(**inputs)[0] + else: + import torch + with torch.no_grad(): + predictions = self.model(**inputs)[0] + + return predictions.numpy().tolist() + + +class QuestionAnsweringPipeline(Pipeline): + + @classmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + pass + + def __call__(self, texts, **kwargs): + # Generic compatibility with sklearn and Keras + if 'X' in kwargs and not texts: + texts = kwargs.pop('X') + + if not isinstance(texts, (tuple, list)): + raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of tuple.') + + if not isinstance(texts, list): + texts = [texts] + + inputs = self.tokenizer.batch_encode_plus( + texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + ) + + # Remove special_tokens_mask to avoid KeyError + _ = inputs.pop('special_tokens_mask') + + if is_tf_available(): + # TODO trace model + start, end = self.model(inputs) + else: + import torch + with torch.no_grad(): + # Retrieve the score for the context tokens only (removing question tokens) + start, end = self.model(**inputs) + start, end = start.cpu().numpy(), end.cpu().numpy() + + answers = [] + for i in range(len(texts)): + context_idx = inputs['token_type_ids'][i] == 1 + start_, end_ = start[i, context_idx], end[i, context_idx] + + # Normalize logits and spans to retrieve the answer + start_, end_ = self.decode(start_, end_) + + # Convert the answer (tokens) back to the original text + answers += [{ + 'start': start_, + 'end': end_, + 'answer': self.span_to_answer(texts[i][1], start_, end_) + }] + + return answers + + def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple: + # Ensure we have batch axis + if start.ndim == 1: + start = start[None] + + if end.ndim == 1: + end = end[None] + + # Compute the score of each tuple(start, end) to be the real answer + outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) + + # Remove candidate with end < start and end - start > 15 + candidates = np.tril(np.triu(outer), 15) + + start = np.max(candidates, axis=2).argmax(-1) + end = np.max(candidates, axis=1).argmax(-1) + + return start, end + + def span_to_answer(self, text: str, start: int, end: int): + words, token_idx = [], 0 + + for i, word in enumerate(text.split(" ")): + token = self.tokenizer.tokenize(word) + + # Append words if they are in the span + if start <= token_idx <= end: + words += [word] + + # Stop if we went over the end of the answer + if token_idx > end: + break + + # Append the subtokenization length to the running index + token_idx += len(token) + + # Join text with spaces + return ' '.join(words) + + +# Register all the supported task here +SUPPORTED_TASKS = { + 'text-classification': { + 'impl': TextClassificationPipeline, + 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, + 'pt': AutoModelForSequenceClassification 
if is_torch_available() else None + }, + 'question-answering': { + 'impl': QuestionAnsweringPipeline, + 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, + 'pt': AutoModelForQuestionAnswering if is_torch_available() else None + } +} + + +def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline: + """ + Utility factory method to build pipeline. + """ + # Try to infer tokenizer from model name (if provided as str) + if tokenizer is None and isinstance(model, str): + tokenizer = model + else: + # Impossible to guess what the right tokenizer is here + raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') + + tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer) + + if task not in SUPPORTED_TASKS: + raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) + + targeted_task = SUPPORTED_TASKS[task] + task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] + + model = allocator.from_pretrained(model) + return task(model, tokenizer, **kwargs) From 02110485b0980c2b0c8c4dc070643eff9c289cff Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 6 Dec 2019 18:11:27 +0100 Subject: [PATCH 069/302] Added batching, topk, chars index and scores. --- transformers/pipelines.py | 114 +++++++++++++++++++++++++++++--------- 1 file changed, 88 insertions(+), 26 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index e85f2300e3..f3b70908dd 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -16,7 +16,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os from abc import ABC, abstractmethod -from typing import Union, Optional, Tuple +from typing import Union, Optional, Tuple, List, Dict import numpy as np @@ -24,7 +24,8 @@ from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedToken if is_tf_available(): from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering -else: + +if is_torch_available(): from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering @@ -94,30 +95,71 @@ class TextClassificationPipeline(Pipeline): class QuestionAnsweringPipeline(Pipeline): + """ + Question Answering pipeline involving Tokenization and Inference.
+ TODO: + - top-k answers + - return start/end chars + - return score + """ + + def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]): + super().__init__(model, tokenizer) + + @staticmethod + def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[dict, List[Dict]]: + is_list = isinstance(question, list) + + if is_list: + return [{'question': q, 'context': c} for q, c in zip(question, context)] + else: + return {'question': question, 'context': context} @classmethod def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): pass - def __call__(self, texts, **kwargs): - # Generic compatibility with sklearn and Keras - if 'X' in kwargs and not texts: - texts = kwargs.pop('X') + def __call__(self, *texts, **kwargs): + # Set defaults values + kwargs.setdefault('max_answer_len', 15) + kwargs.setdefault('topk', 1) - if not isinstance(texts, (tuple, list)): - raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of tuple.') + if kwargs['topk'] < 1: + raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk'])) + + if kwargs['max_answer_len'] < 1: + raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) + + # Tabular input + if 'question' in kwargs and 'context' in kwargs: + texts = QuestionAnsweringPipeline.create_sample(kwargs['questions'], kwargs['contexts']) + elif 'data' in kwargs: + texts = kwargs['data'] + # Generic compatibility with sklearn and Keras + elif 'X' in kwargs and not texts: + texts = kwargs.pop('X') + else: + (texts, ) = texts + + if not isinstance(texts, (dict, list)): + raise Exception('QuestionAnsweringPipeline requires predict argument to be a tuple (context, question) or a List of dict.') if not isinstance(texts, list): texts = [texts] + # Map to tuple (question, context) + texts = [(text['question'], text['context']) for text in texts] + inputs = self.tokenizer.batch_encode_plus( - texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + # texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + texts, add_special_tokens=True, return_tensors='pt' ) # Remove special_tokens_mask to avoid KeyError _ = inputs.pop('special_tokens_mask') - if is_tf_available(): + # if is_tf_available(): + if False: # TODO trace model start, end = self.model(inputs) else: @@ -133,18 +175,19 @@ class QuestionAnsweringPipeline(Pipeline): start_, end_ = start[i, context_idx], end[i, context_idx] # Normalize logits and spans to retrieve the answer - start_, end_ = self.decode(start_, end_) + start_ = np.exp(start_) / np.sum(np.exp(start_)) + end_ = np.exp(end_) / np.sum(np.exp(end_)) + starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len']) # Convert the answer (tokens) back to the original text - answers += [{ - 'start': start_, - 'end': end_, - 'answer': self.span_to_answer(texts[i][1], start_, end_) - }] + answers += [[ + {**{'score': score}, **self.span_to_answer(texts[i][1], s, e)} + for s, e, score in zip(starts, ends, scores) + ]] return answers - def decode(self, start: np.ndarray, end: np.ndarray) -> Tuple: + def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: # Ensure we have batch axis if start.ndim == 1: start = start[None] @@ -155,22 +198,39 @@ class QuestionAnsweringPipeline(Pipeline): # Compute the score of each tuple(start, end) to be the real answer outer = 
np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) - # Remove candidate with end < start and end - start > 15 - candidates = np.tril(np.triu(outer), 15) + # Remove candidate with end < start and end - start > max_answer_len + candidates = np.tril(np.triu(outer), max_answer_len - 1) - start = np.max(candidates, axis=2).argmax(-1) - end = np.max(candidates, axis=1).argmax(-1) + # start = np.max(candidates, axis=2).argmax(-1) + # end = np.max(candidates, axis=1).argmax(-1) - return start, end + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + start, end = np.unravel_index(idx_sort, candidates.shape)[1:] + return start, end, candidates[0, start, end] def span_to_answer(self, text: str, start: int, end: int): - words, token_idx = [], 0 + words = [] + token_idx = char_start_idx = char_end_idx = chars_idx = 0 for i, word in enumerate(text.split(" ")): token = self.tokenizer.tokenize(word) # Append words if they are in the span if start <= token_idx <= end: + if token_idx == start: + char_start_idx = chars_idx + + if token_idx == end: + char_end_idx = chars_idx + len(word) + words += [word] # Stop if we went over the end of the answer @@ -179,9 +239,10 @@ class QuestionAnsweringPipeline(Pipeline): # Append the subtokenization length to the running index token_idx += len(token) + chars_idx += len(word) + 1 # Join text with spaces - return ' '.join(words) + return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)} # Register all the supported task here @@ -193,7 +254,7 @@ SUPPORTED_TASKS = { }, 'question-answering': { 'impl': QuestionAnsweringPipeline, - 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, + # 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, 'pt': AutoModelForQuestionAnswering if is_torch_available() else None } } @@ -216,7 +277,8 @@ def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenize raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) targeted_task = SUPPORTED_TASKS[task] - task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] + # task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] + task, allocator = targeted_task['impl'], targeted_task['pt'] model = allocator.from_pretrained(model) return task(model, tokenizer, **kwargs) From 6e61e06051160812d401c07e5a4c77321191ec1e Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 9 Dec 2019 11:13:27 +0100 Subject: [PATCH 070/302] batch_encode_plus generates the encoder_attention_mask to avoid attending over padded values. 
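The `decode` method added in the patch above ranks every candidate span by the product of its start and end probabilities, zeroes out pairs where the end precedes the start or the span is longer than `max_answer_len`, and keeps the top-k entries of the flattened score matrix. A minimal standalone sketch of that ranking, with made-up probabilities for a five-token context (variable names mirror the pipeline code, the numbers are purely illustrative):

```python
import numpy as np

# Hypothetical per-token start/end probabilities for a 5-token context (illustrative only)
start = np.array([[0.10, 0.60, 0.15, 0.10, 0.05]])
end = np.array([[0.05, 0.10, 0.55, 0.20, 0.10]])

# Score every (start, end) pair as p(start) * p(end)
outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

# Keep only spans with end >= start and length <= max_answer_len
max_answer_len = 3
candidates = np.tril(np.triu(outer), max_answer_len - 1)

# Top-k spans over the flattened score matrix, same argpartition trick as the pipeline
topk = 2
scores_flat = candidates.flatten()
idx = np.argpartition(-scores_flat, topk)[0:topk]
idx_sort = idx[np.argsort(-scores_flat[idx])]

starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
print(starts, ends, candidates[0, starts, ends])  # spans (1, 2) and (1, 3) here
```

The `np.argpartition` call avoids a full sort when only a handful of spans are needed, which is why the pipeline special-cases `topk == 1` with a plain `argmax`.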
--- transformers/pipelines.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index f3b70908dd..8b329abd24 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -132,7 +132,7 @@ class QuestionAnsweringPipeline(Pipeline): # Tabular input if 'question' in kwargs and 'context' in kwargs: - texts = QuestionAnsweringPipeline.create_sample(kwargs['questions'], kwargs['contexts']) + texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context']) elif 'data' in kwargs: texts = kwargs['data'] # Generic compatibility with sklearn and Keras @@ -156,7 +156,10 @@ class QuestionAnsweringPipeline(Pipeline): ) # Remove special_tokens_mask to avoid KeyError - _ = inputs.pop('special_tokens_mask') + special_tokens_mask, input_len = inputs.pop('special_tokens_mask'), inputs.pop('input_len') + + # TODO : Harmonize model arguments across all model + inputs['attention_mask'] = inputs.pop('encoder_attention_mask') # if is_tf_available(): if False: From f116cf599cd979636bdf37df31be62088a1cb7e0 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 9 Dec 2019 11:32:49 +0100 Subject: [PATCH 071/302] Allow hidding frameworks through environment variables (NO_TF, NO_TORCH). --- transformers/file_utils.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 24abd60781..4784681fb4 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -27,17 +27,25 @@ from contextlib import contextmanager logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: - import tensorflow as tf - assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 - _tf_available = True # pylint: disable=invalid-name - logger.info("TensorFlow version {} available.".format(tf.__version__)) + if 'NO_TF' in os.environ and os.environ['NO_TF'].upper() in ('1', 'ON'): + logger.info("Found NO_TF, disabling TensorFlow") + _tf_available = False + else: + import tensorflow as tf + assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 + _tf_available = True # pylint: disable=invalid-name + logger.info("TensorFlow version {} available.".format(tf.__version__)) except (ImportError, AssertionError): _tf_available = False # pylint: disable=invalid-name try: - import torch - _torch_available = True # pylint: disable=invalid-name - logger.info("PyTorch version {} available.".format(torch.__version__)) + if 'NO_TORCH' in os.environ and os.environ['NO_TORCH'].upper() in ('1', 'ON'): + logger.info("Found NO_TORCH, disabling PyTorch") + _torch_available = False + else: + import torch + _torch_available = True # pylint: disable=invalid-name + logger.info("PyTorch version {} available.".format(torch.__version__)) except ImportError: _torch_available = False # pylint: disable=invalid-name @@ -77,6 +85,7 @@ def is_torch_available(): return _torch_available def is_tf_available(): + return _tf_available if not six.PY2: From c2407fdd88719eed66227815188b5908eca4b3a7 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 9 Dec 2019 11:47:52 +0100 Subject: [PATCH 072/302] Enable the Tensorflow backend. 
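The `NO_TF` / `NO_TORCH` flags added to `file_utils.py` in the previous patch give a way to hide an installed framework from the library without uninstalling it. A minimal sketch, assuming a working PyTorch install; the flag has to be set before `transformers` is imported, because the check runs at import time:

```python
import os

# Hide TensorFlow from transformers; per file_utils.py, '1' and 'ON' are both accepted.
# This must happen before `transformers` is imported, since the availability
# check is evaluated at import time.
os.environ["NO_TF"] = "1"

from transformers import is_tf_available, is_torch_available

assert not is_tf_available()
assert is_torch_available()  # assuming torch is installed

# Downstream code such as the pipeline() factory now selects the PyTorch
# AutoModel classes, because it branches on is_tf_available().
```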
--- transformers/pipelines.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 8b329abd24..e484958dcc 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -151,8 +151,7 @@ class QuestionAnsweringPipeline(Pipeline): texts = [(text['question'], text['context']) for text in texts] inputs = self.tokenizer.batch_encode_plus( - # texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' - texts, add_special_tokens=True, return_tensors='pt' + texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' ) # Remove special_tokens_mask to avoid KeyError @@ -161,10 +160,10 @@ class QuestionAnsweringPipeline(Pipeline): # TODO : Harmonize model arguments across all model inputs['attention_mask'] = inputs.pop('encoder_attention_mask') - # if is_tf_available(): - if False: + if is_tf_available(): # TODO trace model start, end = self.model(inputs) + start, end = start.numpy(), end.numpy() else: import torch with torch.no_grad(): @@ -204,9 +203,7 @@ class QuestionAnsweringPipeline(Pipeline): # Remove candidate with end < start and end - start > max_answer_len candidates = np.tril(np.triu(outer), max_answer_len - 1) - # start = np.max(candidates, axis=2).argmax(-1) - # end = np.max(candidates, axis=1).argmax(-1) - + # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA) scores_flat = candidates.flatten() if topk == 1: idx_sort = [np.argmax(scores_flat)] @@ -257,7 +254,7 @@ SUPPORTED_TASKS = { }, 'question-answering': { 'impl': QuestionAnsweringPipeline, - # 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, + 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, 'pt': AutoModelForQuestionAnswering if is_torch_available() else None } } @@ -280,8 +277,7 @@ def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenize raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) targeted_task = SUPPORTED_TASKS[task] - # task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] - task, allocator = targeted_task['impl'], targeted_task['pt'] + task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] model = allocator.from_pretrained(model) return task(model, tokenizer, **kwargs) From 348e19aa2104d59b91bc7216da5fcabf04f0bc5d Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 9 Dec 2019 12:10:26 +0100 Subject: [PATCH 073/302] Expose attention_masks and input_lengths arguments to batch_encode_plus --- transformers/pipelines.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index e484958dcc..46fb735a70 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -149,14 +149,11 @@ class QuestionAnsweringPipeline(Pipeline): # Map to tuple (question, context) texts = [(text['question'], text['context']) for text in texts] - inputs = self.tokenizer.batch_encode_plus( - texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + texts, add_special_tokens=False, return_tensors='tf' if is_tf_available() else 'pt', + return_attention_masks=True, return_input_lengths=False ) - # Remove special_tokens_mask to avoid KeyError - special_tokens_mask, input_len = inputs.pop('special_tokens_mask'), inputs.pop('input_len') - # TODO : Harmonize 
model arguments across all model inputs['attention_mask'] = inputs.pop('encoder_attention_mask') From fe0f552e00e7556c9dd6eccc2486b962bb2a3460 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 9 Dec 2019 14:13:17 +0100 Subject: [PATCH 074/302] Use attention_mask everywhere. --- transformers/pipelines.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 46fb735a70..57fe2f1357 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -154,9 +154,6 @@ class QuestionAnsweringPipeline(Pipeline): return_attention_masks=True, return_input_lengths=False ) - # TODO : Harmonize model arguments across all model - inputs['attention_mask'] = inputs.pop('encoder_attention_mask') - if is_tf_available(): # TODO trace model start, end = self.model(inputs) From a7d3794a298d77a1ae0c75c84ca963ac78058243 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 9 Dec 2019 18:34:58 +0100 Subject: [PATCH 075/302] Remove token_type_ids for compatibility with DistilBert --- transformers/pipelines.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 57fe2f1357..1701915203 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -20,7 +20,7 @@ from typing import Union, Optional, Tuple, List, Dict import numpy as np -from transformers import is_tf_available, logger, AutoTokenizer, PreTrainedTokenizer, is_torch_available +from transformers import is_tf_available, is_torch_available, logger, AutoTokenizer, PreTrainedTokenizer if is_tf_available(): from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering @@ -154,6 +154,8 @@ class QuestionAnsweringPipeline(Pipeline): return_attention_masks=True, return_input_lengths=False ) + token_type_ids = inputs.pop('token_type_ids') + if is_tf_available(): # TODO trace model start, end = self.model(inputs) @@ -167,7 +169,7 @@ class QuestionAnsweringPipeline(Pipeline): answers = [] for i in range(len(texts)): - context_idx = inputs['token_type_ids'][i] == 1 + context_idx = token_type_ids[i] == 1 start_, end_ = start[i, context_idx], end[i, context_idx] # Normalize logits and spans to retrieve the answer From aae74065dff94465cdf6d92ccfd5dee030268885 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 9 Dec 2019 18:35:26 +0100 Subject: [PATCH 076/302] Added QuestionAnsweringPipeline unit tests. 
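At this point a question-answering pipeline returns, for every input example, a list of `topk` candidate answers, each one a dict carrying `score`, `start`, `end` and `answer` — the structure the new unit tests below assert. A hedged usage sketch; the checkpoint name is one of the fine-tuned models listed in the tests, and the commented values are illustrative only, not real model output:

```python
from transformers import pipeline

nlp = pipeline('question-answering', 'distilbert-base-uncased-distilled-squad')

answers = nlp(question="What is the name of the company I'm working for ?",
              context="I'm working for Huggingface.",
              topk=2)

# Illustrative shape only: one inner list per example, `topk` dicts per list, e.g.
# [[{'score': 0.98, 'start': 16, 'end': 28, 'answer': 'Huggingface.'},
#   {'score': 0.01, 'start': 4, 'end': 28, 'answer': 'working for Huggingface.'}]]
print(answers)
```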
--- transformers/tests/pipelines_test.py | 83 ++++++++++++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 transformers/tests/pipelines_test.py diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py new file mode 100644 index 0000000000..36d6e013c3 --- /dev/null +++ b/transformers/tests/pipelines_test.py @@ -0,0 +1,83 @@ +import unittest +from unittest.mock import patch + + +QA_FINETUNED_MODELS = { + 'bert-large-uncased-whole-word-masking-finetuned-squad', + 'bert-large-cased-whole-word-masking-finetuned-squad', + 'distilbert-base-uncased-distilled-squad', + +} + + +class QuestionAnsweringPipelineTest(unittest.TestCase): + def check_answer_structure(self, answer, batch, topk): + self.assertIsInstance(answer, list) + self.assertEqual(len(answer), batch) + self.assertIsInstance(answer[0], list) + self.assertEqual(len(answer[0]), topk) + self.assertIsInstance(answer[0][0], dict) + + for item in answer[0]: + self.assertTrue('start' in item) + self.assertTrue('end' in item) + self.assertTrue('score' in item) + self.assertTrue('answer' in item) + + def question_answering_pipeline(self, nlp): + # Simple case with topk = 1, no batching + a = nlp(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.') + self.check_answer_structure(a, 1, 1) + + # Simple case with topk = 2, no batching + a = nlp(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.', topk=2) + self.check_answer_structure(a, 1, 2) + + # Batch case with topk = 1 + a = nlp(question=['What is the name of the company I\'m working for ?', 'Where is the company based ?'], + context=['I\'m working for Huggingface.', 'The company is based in New York and Paris']) + self.check_answer_structure(a, 2, 1) + + # Batch case with topk = 2 + a = nlp(question=['What is the name of the company I\'m working for ?', 'Where is the company based ?'], + context=['I\'m working for Huggingface.', 'The company is based in New York and Paris'], topk=2) + self.check_answer_structure(a, 2, 2) + + @patch('transformers.pipelines.is_torch_available', return_value=False) + def test_tf_models(self, is_torch_available): + from transformers import pipeline + for model in QA_FINETUNED_MODELS: + self.question_answering_pipeline(pipeline('question-answering', model)) + + @patch('transformers.pipelines.is_tf_available', return_value=False) + @patch('transformers.tokenization_utils.is_tf_available', return_value=False) + def test_torch_models(self, is_tf_available, _): + from transformers import pipeline + for model in QA_FINETUNED_MODELS: + self.question_answering_pipeline(pipeline('question-answering', model)) + + +class AutoPipelineTest(unittest.TestCase): + @patch('transformers.pipelines.is_torch_available', return_value=False) + def test_tf_qa(self, is_torch_available): + from transformers import pipeline + from transformers.pipelines import QuestionAnsweringPipeline + from transformers.modeling_tf_utils import TFPreTrainedModel + for model in QA_FINETUNED_MODELS: + nlp = pipeline('question-answering', model) + self.assertIsInstance(nlp, QuestionAnsweringPipeline) + self.assertIsInstance(nlp.model, TFPreTrainedModel) + + @patch('transformers.pipelines.is_tf_available', return_value=False) + def test_torch_qa(self, is_tf_available): + from transformers import pipeline + from transformers.pipelines import QuestionAnsweringPipeline + from transformers.modeling_utils import PreTrainedModel + for model in QA_FINETUNED_MODELS: + 
nlp = pipeline('question-answering', model) + self.assertIsInstance(nlp, QuestionAnsweringPipeline) + self.assertIsInstance(nlp.model, PreTrainedModel) + + +if __name__ == '__main__': + unittest.main() From 8ae1044f80ef543e4657c97d1030649d4da15aa8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 10 Dec 2019 15:11:07 +0100 Subject: [PATCH 077/302] updating tests and TF 2.0 model --- transformers/modeling_t5.py | 31 ++++++--- transformers/modeling_tf_t5.py | 44 ++++++++++--- transformers/tests/modeling_common_test.py | 18 +++-- transformers/tests/modeling_t5_test.py | 9 ++- transformers/tests/modeling_tf_common_test.py | 65 +++++++++++-------- transformers/tests/modeling_tf_t5_test.py | 10 +-- transformers/tests/tokenization_t5_test.py | 1 - 7 files changed, 121 insertions(+), 57 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index e48293b49e..f1e4e0306c 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -726,8 +726,11 @@ class T5Model(T5PreTrainedModel): encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) encoder_attention_mask = kwargs_encoder.get("attention_mask", None) if encoder_hidden_states is None: - encoder_inputs_ids = kwargs_encoder.pop("input_ids") - hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + # Convert encoder inputs in embeddings if needed + hidden_states = kwargs_encoder.pop("inputs_embeds", None) + if hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings if encoder_attention_mask is not None: # Apply masking @@ -740,8 +743,12 @@ class T5Model(T5PreTrainedModel): encoder_outputs = () # Decode - decoder_inputs_ids = kwargs_decoder.pop("input_ids") - hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + # Convert decoder inputs in embeddings if needed + hidden_states = kwargs_decoder.pop("inputs_embeds", None) + if hidden_states is None: + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) @@ -825,16 +832,24 @@ class T5WithLMHeadModel(T5PreTrainedModel): # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) if encoder_hidden_states is None: - encoder_inputs_ids = kwargs_encoder.pop("input_ids") - hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + # Convert encoder inputs in embeddings if needed + hidden_states = kwargs_encoder.pop("inputs_embeds", None) + if hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) encoder_hidden_states = encoder_outputs[0] else: encoder_outputs = () # Decode - decoder_inputs_ids = kwargs_decoder.pop("input_ids") - hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + # Convert decoder inputs in embeddings if needed + hidden_states = kwargs_decoder.pop("inputs_embeds", None) + if hidden_states is None: + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states 
kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index 11762ee1e5..447fd69b05 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -613,6 +613,12 @@ class TFT5Model(TFT5PreTrainedModel): decoder_config.is_decoder = True self.decoder = TFT5MainLayer(decoder_config, name='decoder') + def get_input_embeddings(self): + return self.shared + + def get_output_embeddings(self): + return self.shared + def call(self, decoder_input_ids, **kwargs): # We allow two types of multi-inputs: # - traditional keyword arguments in the call method @@ -634,16 +640,24 @@ class TFT5Model(TFT5PreTrainedModel): # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) if encoder_hidden_states is None: - encoder_inputs_ids = kwargs_encoder.pop("input_ids") - hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + # Convert encoder inputs in embeddings if needed + hidden_states = kwargs_encoder.pop("inputs_embeds", None) + if hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) encoder_hidden_states = encoder_outputs[0] else: encoder_outputs = () # Decode - decoder_inputs_ids = kwargs_decoder.pop("input_ids") - hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + # Convert decoder inputs in embeddings if needed + hidden_states = kwargs_decoder.pop("inputs_embeds", None) + if hidden_states is None: + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) @@ -692,6 +706,12 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): decoder_config.is_decoder = True self.decoder = TFT5MainLayer(decoder_config, name='decoder') + def get_input_embeddings(self): + return self.shared + + def get_output_embeddings(self): + return self.shared + def call(self, decoder_input_ids, **kwargs): # We allow two types of multi-inputs: # - traditional keyword arguments in the call method @@ -713,16 +733,24 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): # Encode if needed (training, first prediction pass) encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) if encoder_hidden_states is None: - encoder_inputs_ids = kwargs_encoder.pop("input_ids") - hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + # Convert encoder inputs in embeddings if needed + hidden_states = kwargs_encoder.pop("inputs_embeds", None) + if hidden_states is None: + encoder_inputs_ids = kwargs_encoder.pop("input_ids") + hidden_states = self.shared(encoder_inputs_ids) # Convert inputs in embeddings + encoder_outputs = self.encoder(hidden_states, **kwargs_encoder) encoder_hidden_states = encoder_outputs[0] else: encoder_outputs = () # Decode - decoder_inputs_ids = kwargs_decoder.pop("input_ids") - hidden_states = self.shared(decoder_inputs_ids) # Convert inputs in embeddings + # Convert decoder inputs in embeddings if needed + hidden_states = kwargs_decoder.pop("inputs_embeds", None) + if 
hidden_states is None: + decoder_inputs_ids = kwargs_decoder.pop("input_ids") + hidden_states = self.shared(decoder_inputs_ids) + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None) decoder_outputs = self.decoder(hidden_states, **kwargs_decoder) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index cdfbfc09e2..792f5cee3e 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -568,8 +568,14 @@ class CommonTestCases: def test_inputs_embeds(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - del inputs_dict["input_ids"] + if not self.is_encoder_decoder: + input_ids = inputs_dict["input_ids"] + del inputs_dict["input_ids"] + else: + encoder_input_ids = inputs_dict["encoder_input_ids"] + decoder_input_ids = inputs_dict["decoder_input_ids"] + del inputs_dict["encoder_input_ids"] + del inputs_dict["decoder_input_ids"] for model_class in self.all_model_classes: model = model_class(config) @@ -577,9 +583,13 @@ class CommonTestCases: model.eval() wte = model.get_input_embeddings() - inputs_dict["inputs_embeds"] = wte(input_ids) - outputs = model(**inputs_dict) + if not self.is_encoder_decoder: + inputs_dict["inputs_embeds"] = wte(input_ids) + else: + inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids) + inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids) + outputs = model(**inputs_dict) class GPTModelTester(CommonModelTester): diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index 091bd742b5..a539cc868a 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -18,20 +18,19 @@ from __future__ import print_function import unittest import shutil -import pytest from transformers import is_torch_available -from .modeling_common_test import (CommonTestCases, ids_tensor) +from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor) from .configuration_common_test import ConfigTester +from .utils import require_torch, slow, torch_device if is_torch_available(): from transformers import (T5Config, T5Model, T5WithLMHeadModel) from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP -else: - pytestmark = pytest.mark.skip("Require Torch") +@require_torch class T5ModelTest(CommonTestCases.CommonModelTester): all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else () @@ -174,7 +173,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs) - @pytest.mark.slow + @slow def test_model_from_pretrained(self): cache_dir = "/tmp/transformers_test/" for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index 8957313021..a0d63583fb 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -130,12 +130,12 @@ class TFCommonTestCases: for name, key in inputs_dict.items()) with torch.no_grad(): pto = pt_model(**pt_inputs_dict) - tfo = tf_model(inputs_dict) - tfo = tfo[0].numpy() - pto = pto[0].numpy() - tfo[np.isnan(tfo)] = 0 - pto[np.isnan(pto)] = 0 - max_diff = np.amax(np.abs(tfo - 
pto)) + tfo = tf_model(inputs_dict, training=False) + tf_hidden_states = tfo[0].numpy() + pt_hidden_states = pto[0].numpy() + tf_hidden_states[np.isnan(tf_hidden_states)] = 0 + pt_hidden_states[np.isnan(pt_hidden_states)] = 0 + max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states)) self.assertLessEqual(max_diff, 2e-2) # Check we can load pt model in tf and vice-versa with checkpoint => model functions @@ -296,33 +296,46 @@ class TFCommonTestCases: first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0] self.assertTrue(tf.math.equal(first, second).numpy().all()) + def _get_embeds(self, wte, input_ids): + # ^^ In our TF models, the input_embeddings can take slightly different forms, + # so we try a few of them. + # We used to fall back to just synthetically creating a dummy tensor of ones: + try: + x = wte(input_ids, mode="embedding") + except: + try: + x = wte([input_ids], mode="embedding") + except: + try: + x = wte([input_ids, None, None, None], mode="embedding") + except: + if hasattr(self.model_tester, "embedding_size"): + x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32) + else: + x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32) + return x + def test_inputs_embeds(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - input_ids = inputs_dict["input_ids"] - del inputs_dict["input_ids"] + if not self.is_encoder_decoder: + input_ids = inputs_dict["input_ids"] + del inputs_dict["input_ids"] + else: + encoder_input_ids = inputs_dict["encoder_input_ids"] + decoder_input_ids = inputs_dict["decoder_input_ids"] + del inputs_dict["encoder_input_ids"] + del inputs_dict["decoder_input_ids"] for model_class in self.all_model_classes: model = model_class(config) wte = model.get_input_embeddings() - try: - x = wte(input_ids, mode="embedding") - except: - try: - x = wte([input_ids], mode="embedding") - except: - try: - x = wte([input_ids, None, None, None], mode="embedding") - except: - if hasattr(self.model_tester, "embedding_size"): - x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32) - else: - x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32) - # ^^ In our TF models, the input_embeddings can take slightly different forms, - # so we try a few of them. 
- # We used to fall back to just synthetically creating a dummy tensor of ones: - # - inputs_dict["inputs_embeds"] = x + if not self.is_encoder_decoder: + inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids) + else: + inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids) + inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids) + outputs = model(inputs_dict) diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index 33f6f895f0..99eec313f9 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -18,21 +18,21 @@ from __future__ import print_function import unittest import shutil -import pytest import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester +from .utils import require_tf, slow from transformers import T5Config, is_tf_available if is_tf_available(): import tensorflow as tf - from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) -else: - pytestmark = pytest.mark.skip("Require TensorFlow") + from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel, + TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP) +@require_tf class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): is_encoder_decoder = True @@ -160,7 +160,7 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs) - @pytest.mark.slow + @slow def test_model_from_pretrained(self): cache_dir = "/tmp/transformers_test/" for model_name in ['t5-small']: diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py index aabb21e443..0b4f960e32 100644 --- a/transformers/tests/tokenization_t5_test.py +++ b/transformers/tests/tokenization_t5_test.py @@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -import pytest from transformers.tokenization_t5 import (T5Tokenizer) from transformers.tokenization_xlnet import SPIECE_UNDERLINE From 4b82c485de187896a38c441587b7bd4d04f2821e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Tue, 10 Dec 2019 14:49:53 +0100 Subject: [PATCH 078/302] remove misplaced summarization documentation --- examples/README.md | 30 ------------------------------ 1 file changed, 30 deletions(-) diff --git a/examples/README.md b/examples/README.md index 620304ea77..b6b3908810 100644 --- a/examples/README.md +++ b/examples/README.md @@ -24,8 +24,6 @@ pip install -r ./examples/requirements.txt | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. | [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training. | | [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. | -| [Abstractive summarization](#abstractive-summarization) | Using the BertAbs -model finetuned on the CNN/DailyMail dataset to generate summaries. 
| ## TensorFlow 2.0 Bert models on GLUE @@ -646,34 +644,6 @@ micro avg 0.8722 0.8774 0.8748 13869 macro avg 0.8712 0.8774 0.8740 13869 ``` -## Abstractive summarization - -Based on the script -[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py). - -Before running this script you should download **both** CNN and Daily Mail -datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) (the -links next to "Stories") in the same folder. Then uncompress the archives by running: - -```bash -tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz -``` - -note that the finetuning script **will not work** if you do not download both -datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both -archive. - -```bash -export DATA_PATH=/path/to/dataset/ - -python run_summarization_finetuning.py \ - --output_dir=output \ - --model_type=bert2bert \ - --model_name_or_path=bert2bert \ - --do_train \ - --data_path=$DATA_PATH \ -``` - ## XNLI Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py). From 981a5c8c1789f91204ba1053f4742f6ea8c615af Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 10 Dec 2019 15:36:19 +0100 Subject: [PATCH 079/302] updating models urls --- transformers/configuration_t5.py | 4 ++++ transformers/convert_pytorch_checkpoint_to_tf2.py | 2 +- transformers/modeling_t5.py | 4 ++++ transformers/modeling_tf_t5.py | 6 +++++- transformers/tokenization_t5.py | 12 ++++++++++-- 5 files changed, 24 insertions(+), 4 deletions(-) diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 83aab66fac..2ccdebc2b1 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -28,6 +28,10 @@ logger = logging.getLogger(__name__) T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", + 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", + 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", + 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-config.json", + 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-config.json", } diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index 4c4becfa00..06bb5f47c0 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -121,7 +121,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file if compare_with_pt_model: inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] - tf_inputs = tf.constant(inputs_list) + tf_inputs = tf_model.dummy_inputs tfo = tf_model(tf_inputs, training=False) # build the network pt_model = pt_model_class.from_pretrained(None, diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index f1e4e0306c..ffc4d8bb3f 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -42,6 +42,10 @@ logger = logging.getLogger(__name__) #################################################### T5_PRETRAINED_MODEL_ARCHIVE_MAP = { 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin", + 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin", + 't5-large': 
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin", + 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-pytorch_model.bin", + 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-pytorch_model.bin", } #################################################### diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index 447fd69b05..0b3b1116f2 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -25,13 +25,17 @@ import itertools import tensorflow as tf from .configuration_t5 import T5Config -from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer, DUMMY_INPUTS +from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5", + 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5", + 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5", + 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-tf_model.h5", + 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-tf_model.h5", } #################################################### diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 933084d13a..62e9c069e2 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -41,7 +41,11 @@ VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'} PRETRAINED_VOCAB_FILES_MAP = { 'vocab_file': { - 't5': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", } } @@ -49,7 +53,11 @@ PRETRAINED_VOCAB_FILES_MAP = { # Mapping from model shortcut names to max length of inputs #################################################### PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 't5': 512, + 't5-small': 512, + 't5-base': 512, + 't5-large': 512, + 't5-3B': 512, + 't5-11B': 512, } class T5Tokenizer(PreTrainedTokenizer): From 40a39ab65043f11763d8f0ce5fb0307661e6f7a3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 10 Dec 2019 15:59:38 +0100 Subject: [PATCH 080/302] Reuse recent SQuAD refactored data structure inside QA pipelines. 
--- transformers/data/processors/__init__.py | 2 +- transformers/modeling_auto.py | 12 +--- transformers/pipelines.py | 84 ++++++++++++++++-------- 3 files changed, 59 insertions(+), 39 deletions(-) diff --git a/transformers/data/processors/__init__.py b/transformers/data/processors/__init__.py index 0cef0080f4..4f7307bb7b 100644 --- a/transformers/data/processors/__init__.py +++ b/transformers/data/processors/__init__.py @@ -1,4 +1,4 @@ -from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor, convert_examples_to_features +from .utils import InputExample, InputFeatures, DataProcessor, SingleSentenceClassificationProcessor from .glue import glue_output_modes, glue_processors, glue_tasks_num_labels, glue_convert_examples_to_features from .squad import squad_convert_examples_to_features, SquadFeatures, SquadExample, SquadV1Processor, SquadV2Processor from .xnli import xnli_output_modes, xnli_processors, xnli_tasks_num_labels \ No newline at end of file diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 0c8bffa883..041115cc61 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -31,7 +31,7 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification -from .modeling_camembert import CamembertModel, CamembertForQuestionAnswering, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice +from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering from .modeling_utils import PreTrainedModel, SequenceSummary @@ -294,10 +294,6 @@ class AutoModelWithLMHead(object): return XLMWithLMHeadModel(config) elif isinstance(config, CTRLConfig): return CTRLLMHeadModel(config) - elif isinstance(config, AlbertConfig): - return AlbertLMHeadModel(config) - elif isinstance(config, CamembertConfig): - return CamembertLMHeadModel(config) raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod @@ -454,7 +450,7 @@ class AutoModelForSequenceClassification(object): """ if isinstance(config, AlbertConfig): return AlbertForSequenceClassification(config) - elif isintance(config, CamembertConfig): + elif isinstance(config, CamembertConfig): return CamembertForSequenceClassification(config) elif isinstance(config, DistilBertConfig): return DistilBertForSequenceClassification(config) @@ -606,10 +602,8 @@ class AutoModelForQuestionAnswering(object): config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. model = AutoModelForSequenceClassification.from_config(config) # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` """ - if isintance(config, AlbertConfig): + if isinstance(config, AlbertConfig): return AlbertForQuestionAnswering(config) - elif isintance(config, CamembertConfig): - return CamembertForQuestionAnswering(config) elif isinstance(config, DistilBertConfig): return DistilBertForQuestionAnswering(config) elif isinstance(config, BertConfig): diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 1701915203..1e2f035d9f 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -20,7 +20,8 @@ from typing import Union, Optional, Tuple, List, Dict import numpy as np -from transformers import is_tf_available, is_torch_available, logger, AutoTokenizer, PreTrainedTokenizer +from transformers import is_tf_available, is_torch_available, logger, AutoTokenizer, PreTrainedTokenizer, \ + SquadExample, squad_convert_examples_to_features if is_tf_available(): from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering @@ -107,13 +108,28 @@ class QuestionAnsweringPipeline(Pipeline): super().__init__(model, tokenizer) @staticmethod - def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[dict, List[Dict]]: + def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: is_list = isinstance(question, list) if is_list: - return [{'question': q, 'context': c} for q, c in zip(question, context)] + return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] else: - return {'question': question, 'context': context} + return SquadExample(None, question, context, None, None, None) + + def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: + args = ['input_ids', 'attention_mask'] + model_type = type(self.model).__name__.lower() + + if 'distilbert' not in model_type and 'xlm' not in model_type: + args += ['token_type_ids'] + + if 'xlnet' in model_type or 'xlm' in model_type: + args += ['cls_index', 'p_mask'] + + if isinstance(features, SquadExample): + return {k: features.__dict__[k] for k in args} + else: + return {k: [feature.__dict__[k] for feature in features] for k in args} @classmethod def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): @@ -121,8 +137,11 @@ class QuestionAnsweringPipeline(Pipeline): def __call__(self, *texts, **kwargs): # Set defaults values - kwargs.setdefault('max_answer_len', 15) kwargs.setdefault('topk', 1) + kwargs.setdefault('doc_stride', 128) + kwargs.setdefault('max_answer_len', 15) + kwargs.setdefault('max_seq_len', 384) + kwargs.setdefault('max_question_len', 64) if kwargs['topk'] < 1: raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk'])) @@ -130,56 +149,63 @@ class QuestionAnsweringPipeline(Pipeline): if kwargs['max_answer_len'] < 1: raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) - # Tabular input - if 'question' in kwargs and 'context' in kwargs: - texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context']) - elif 'data' in kwargs: - texts = kwargs['data'] + # Position args + if texts is not None and len(texts) > 1: + (texts, ) = texts + # Generic compatibility with sklearn and Keras elif 'X' in kwargs and not texts: texts = kwargs.pop('X') - else: - (texts, ) = texts - if not isinstance(texts, (dict, list)): - raise Exception('QuestionAnsweringPipeline requires predict 
argument to be a tuple (context, question) or a List of dict.') + # Batched data + elif 'data' in kwargs: + texts = kwargs.pop('data') + + # Tabular input + elif 'question' in kwargs and 'context' in kwargs: + texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context']) + else: + raise ValueError('Unknown arguments {}'.format(kwargs)) if not isinstance(texts, list): texts = [texts] - # Map to tuple (question, context) - texts = [(text['question'], text['context']) for text in texts] - inputs = self.tokenizer.batch_encode_plus( - texts, add_special_tokens=False, return_tensors='tf' if is_tf_available() else 'pt', - return_attention_masks=True, return_input_lengths=False - ) - - token_type_ids = inputs.pop('token_type_ids') + # Convert inputs to features + features = squad_convert_examples_to_features(texts, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) + fw_args = self.inputs_for_model(features) if is_tf_available(): - # TODO trace model - start, end = self.model(inputs) + import tensorflow as tf + fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} + start, end = self.model(fw_args) start, end = start.numpy(), end.numpy() else: import torch with torch.no_grad(): # Retrieve the score for the context tokens only (removing question tokens) - start, end = self.model(**inputs) + fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()} + start, end = self.model(**fw_args) start, end = start.cpu().numpy(), end.cpu().numpy() answers = [] - for i in range(len(texts)): - context_idx = token_type_ids[i] == 1 - start_, end_ = start[i, context_idx], end[i, context_idx] + for i, (example, feature, start_, end_) in enumerate(zip(texts, features, start, end)): + start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1) # Normalize logits and spans to retrieve the answer start_ = np.exp(start_) / np.sum(np.exp(start_)) end_ = np.exp(end_) / np.sum(np.exp(end_)) starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len']) + char_to_word = np.array(example.char_to_word_offset) + # Convert the answer (tokens) back to the original text answers += [[ - {**{'score': score}, **self.span_to_answer(texts[i][1], s, e)} + { + 'score': score, + 'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0], + 'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1], + 'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1]) + } for s, e, score in zip(starts, ends, scores) ]] From a5df980c5b86e9106382a87a63b977d5decf97f6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 10 Dec 2019 16:01:15 +0100 Subject: [PATCH 081/302] updating distilbert test --- transformers/tests/modeling_common_test.py | 7 ++++++- transformers/tests/modeling_tf_common_test.py | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 792f5cee3e..2f2baff436 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -121,7 +121,12 @@ class CommonTestCases: model.to(torch_device) model.eval() first, second = model(**inputs_dict)[0], model(**inputs_dict)[0] - self.assertEqual(first.ne(second).sum().item(), 0) + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = 
np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) def test_attention_outputs(self): config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py index a0d63583fb..5a5873e81b 100644 --- a/transformers/tests/modeling_tf_common_test.py +++ b/transformers/tests/modeling_tf_common_test.py @@ -294,7 +294,12 @@ class TFCommonTestCases: for model_class in self.all_model_classes: model = model_class(config) first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0] - self.assertTrue(tf.math.equal(first, second).numpy().all()) + out_1 = first.numpy() + out_2 = second.numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) def _get_embeds(self, wte, input_ids): # ^^ In our TF models, the input_embeddings can take slightly different forms, From f2538c12741df74abbd2ff38f43019cfbb21093b Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 10 Dec 2019 16:33:11 +0100 Subject: [PATCH 082/302] all tests in torch no grad --- transformers/tests/modeling_common_test.py | 53 ++++++++++++++-------- 1 file changed, 35 insertions(+), 18 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 2f2baff436..ed6f950e25 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -120,7 +120,9 @@ class CommonTestCases: model = model_class(config) model.to(torch_device) model.eval() - first, second = model(**inputs_dict)[0], model(**inputs_dict)[0] + with torch.no_grad(): + first = model(**inputs_dict)[0] + second = model(**inputs_dict)[0] out_1 = first.cpu().numpy() out_2 = second.cpu().numpy() out_1 = out_1[~np.isnan(out_1)] @@ -142,7 +144,8 @@ class CommonTestCases: model = model_class(config) model.to(torch_device) model.eval() - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) attentions = outputs[-1] self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, False) @@ -173,7 +176,8 @@ class CommonTestCases: model = model_class(config) model.to(torch_device) model.eval() - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs)) self.assertEqual(model.config.output_attentions, True) self.assertEqual(model.config.output_hidden_states, True) @@ -273,7 +277,8 @@ class CommonTestCases: inputs = inputs_dict.copy() inputs['head_mask'] = head_mask - outputs = model(**inputs) + with torch.no_grad(): + outputs = model(**inputs) # Test that we can get a gradient back for importance score computation output = sum(t.sum() for t in outputs[0]) @@ -320,7 +325,8 @@ class CommonTestCases: heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} model.prune_heads(heads_to_prune) - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) attentions = outputs[-1] @@ -356,7 +362,8 @@ class CommonTestCases: model = model_class.from_pretrained(directory) model.to(torch_device) - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) self.assertEqual(attentions[1].shape[-3], 
self.model_tester.num_attention_heads) @@ -385,7 +392,8 @@ class CommonTestCases: model.to(torch_device) model.eval() - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], 1) @@ -412,7 +420,8 @@ class CommonTestCases: model.to(torch_device) model.eval() - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) @@ -429,7 +438,8 @@ class CommonTestCases: model.to(torch_device) shutil.rmtree(directory) - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1) @@ -440,7 +450,8 @@ class CommonTestCases: heads_to_prune = {0: [0], 2: [1, 2]} model.prune_heads(heads_to_prune) - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) attentions = outputs[-1] self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1) @@ -459,7 +470,8 @@ class CommonTestCases: model = model_class(config) model.to(torch_device) model.eval() - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) hidden_states = outputs[-1] self.assertEqual(model.config.output_attentions, False) self.assertEqual(model.config.output_hidden_states, True) @@ -594,7 +606,8 @@ class CommonTestCases: inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids) inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids) - outputs = model(**inputs_dict) + with torch.no_grad(): + outputs = model(**inputs_dict) class GPTModelTester(CommonModelTester): @@ -682,9 +695,10 @@ class CommonTestCases: model.to(torch_device) model.eval() - outputs = model(input_ids, position_ids, token_type_ids) - outputs = model(input_ids, position_ids) - outputs = model(input_ids) + with torch.no_grad(): + outputs = model(input_ids, position_ids, token_type_ids) + outputs = model(input_ids, position_ids) + outputs = model(input_ids) hidden_state = outputs[0] self.parent.assertListEqual( @@ -697,7 +711,8 @@ class CommonTestCases: model = self.lm_head_model_class(config) model.to(torch_device) model.eval() - outputs = model(input_ids, position_ids, token_type_ids, lm_labels) + with torch.no_grad(): + outputs = model(input_ids, position_ids, token_type_ids, lm_labels) loss, lm_logits = outputs[:2] total_voc = self.vocab_size @@ -714,7 +729,8 @@ class CommonTestCases: model = model_class(config) model.to(torch_device) model.eval() - outputs = model(input_ids) + with torch.no_grad(): + outputs = model(input_ids) presents = outputs[-1] self.parent.assertEqual(self.num_hidden_layers, len(presents)) self.parent.assertListEqual( @@ -727,7 +743,8 @@ class CommonTestCases: model = self.double_head_model_class(config) model.to(torch_device) model.eval() - outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, + with torch.no_grad(): + outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels, token_type_ids=token_type_ids, position_ids=position_ids) lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4] loss = [lm_loss, mc_loss] From 63e36007ee152cee23f44103622f28566e28fb72 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 10 Dec 2019 16:47:35 +0100 Subject: [PATCH 083/302] Make sure padding, cls and another non-context tokens cannot appear in the 
answer. --- transformers/pipelines.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 1e2f035d9f..eec4932321 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -188,14 +188,18 @@ class QuestionAnsweringPipeline(Pipeline): start, end = start.cpu().numpy(), end.cpu().numpy() answers = [] - for i, (example, feature, start_, end_) in enumerate(zip(texts, features, start, end)): - start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1) - + for (example, feature, start_, end_) in zip(texts, features, start, end): # Normalize logits and spans to retrieve the answer start_ = np.exp(start_) / np.sum(np.exp(start_)) end_ = np.exp(end_) / np.sum(np.exp(end_)) - starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len']) + # Mask padding and question + start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1) + + # Mask CLS + start_[0] = end_[0] = 0 + + starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len']) char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text From 07bc8efbc30f88e25d78b66811d670584a1bb97b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Fri, 15 Nov 2019 10:51:38 +0100 Subject: [PATCH 084/302] add greedy decoding and sampling --- examples/run_generation.py | 292 ++++++++++------------- transformers/modeling_encoder_decoder.py | 162 ++++++++++--- transformers/modeling_transfo_xl.py | 10 +- transformers/modeling_utils.py | 229 ++++++++++++++++++ transformers/modeling_xlm.py | 29 ++- transformers/modeling_xlnet.py | 34 +++ transformers/tests/sampling_test.py | 213 +++++++++++++++++ 7 files changed, 776 insertions(+), 193 deletions(-) create mode 100644 transformers/tests/sampling_test.py diff --git a/examples/run_generation.py b/examples/run_generation.py index 2d917660cf..2075ad8457 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -20,14 +20,10 @@ from __future__ import absolute_import, division, print_function, unicode_litera import argparse import logging -from tqdm import trange import torch -import torch.nn.functional as F import numpy as np -from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig - from transformers import GPT2LMHeadModel, GPT2Tokenizer from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer from transformers import XLNetLMHeadModel, XLNetTokenizer @@ -36,22 +32,22 @@ from transformers import CTRLLMHeadModel, CTRLTokenizer from transformers import XLMWithLMHeadModel, XLMTokenizer -logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s - %(message)s', - datefmt = '%m/%d/%Y %H:%M:%S', - level = logging.INFO) +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) logger = logging.getLogger(__name__) MAX_LENGTH = int(10000) # Hardcoded max length to avoid infinite loop -ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig, CTRLConfig)), ()) - MODEL_CLASSES = { - 'gpt2': (GPT2LMHeadModel, GPT2Tokenizer), - 'ctrl': (CTRLLMHeadModel, CTRLTokenizer), - 'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), - 'xlnet': (XLNetLMHeadModel, 
XLNetTokenizer), - 'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer), - 'xlm': (XLMWithLMHeadModel, XLMTokenizer), + "gpt2": (GPT2LMHeadModel, GPT2Tokenizer), + "ctrl": (CTRLLMHeadModel, CTRLTokenizer), + "openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer), + "xlnet": (XLNetLMHeadModel, XLNetTokenizer), + "transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer), + "xlm": (XLMWithLMHeadModel, XLMTokenizer), } # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia @@ -75,81 +71,78 @@ def set_seed(args): if args.n_gpu > 0: torch.cuda.manual_seed_all(args.seed) - -def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): - """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering - Args: - logits: logits distribution shape (batch size x vocabulary size) - top_k > 0: keep only top k tokens with highest probability (top-k filtering). - top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). - Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) - From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 - """ - top_k = min(top_k, logits.size(-1)) # Safety check - if top_k > 0: - # Remove all tokens with a probability less than the last token of the top-k - indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] - logits[indices_to_remove] = filter_value - - if top_p > 0.0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - - # Remove tokens with cumulative probability above the threshold - sorted_indices_to_remove = cumulative_probs > top_p - # Shift the indices to the right to keep also the first token above the threshold - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) - logits[indices_to_remove] = filter_value - return logits +# +# Functions to prepare models' input +# -def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0, - is_xlnet=False, is_xlm_mlm=False, xlm_mask_token=None, xlm_lang=None, device='cpu'): - context = torch.tensor(context, dtype=torch.long, device=device) - context = context.unsqueeze(0).repeat(num_samples, 1) - generated = context - with torch.no_grad(): - for _ in trange(length): +def prepare_ctrl_input(args, _, tokenizer, prompt_text): + if args.temperature > 0.7: + logger.info( + "CTRL typically works better with lower temperatures (and lower top_k)." 
+ ) - inputs = {'input_ids': generated} - if is_xlnet: - # XLNet is a direct (predict same token, not next token) and bi-directional model by default - # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring) - input_ids = torch.cat((generated, torch.zeros((1, 1), dtype=torch.long, device=device)), dim=1) - perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float, device=device) - perm_mask[:, :, -1] = 1.0 # Previous tokens don't see last token - target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float, device=device) - target_mapping[0, 0, -1] = 1.0 # predict last token - inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping} + encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False) + if not any(encoded_prompt[0] == x for x in tokenizer.control_codes.values()): + logger.info( + "WARNING! You are not starting your generation from a control code so you won't get good results" + ) + return prompt_text, {} - if is_xlm_mlm and xlm_mask_token: - # XLM MLM models are direct models (predict same token, not next token) - # => need one additional dummy token in the input (will be masked and guessed) - input_ids = torch.cat((generated, torch.full((1, 1), xlm_mask_token, dtype=torch.long, device=device)), dim=1) - inputs = {'input_ids': input_ids} - if xlm_lang is not None: - inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1) +def prepare_xlm_input(args, model, tokenizer, prompt_text): + kwargs = {"language": None, "mask_token": None} - outputs = model(**inputs) # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states) - next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.) + # Set the language + use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb + if hasattr(model.config, "lang2id") and use_lang_emb: + available_languages = model.config.lang2id.keys() + if args.xlm_language in available_languages: + language = args.xlm_language + else: + language = None + while language not in available_languages: + language = input( + "Using XLM. 
Select language in " + + str(list(available_languages)) + + " >>> " + ) + kwargs["language"] = tokenizer.lang2id[language] - # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858) - for i in range(num_samples): - for _ in set(generated[i].tolist()): - next_token_logits[i, _] /= repetition_penalty - - filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) - if temperature == 0: # greedy sampling: - next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1) - else: - next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1) - generated = torch.cat((generated, next_token), dim=1) - return generated + # XLM masked-language modeling (MLM) models need masked token + is_xlm_mlm = "mlm" in args.model_name_or_path + if is_xlm_mlm: + kwargs["mask_token"] = tokenizer.mask_token_id + + return prompt_text, kwargs + + +def prepare_xlnet_input(args, _, tokenizer, prompt_text): + prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text + return prompt_text, {} + + +def prepare_transfoxl_input(args, _, tokenizer, prompt_text): + prompt_text = (args.padding_text if args.padding_text else PADDING_TEXT) + prompt_text + return prompt_text, {} + + +PREPROCESSING_FUNCTIONS = { + "ctrl": prepare_ctrl_input, + "xlm": prepare_xlm_input, + "xlnet": prepare_xlnet_input, + "transfo-xl": prepare_transfoxl_input, +} + + +def adjust_length_to_model(length, max_sequence_length): + if length < 0 and max_sequence_length > 0: + length = max_sequence_length + elif 0 < max_sequence_length < length: + length = max_sequence_length # No generation bigger than model size + elif length < 0: + length = MAX_LENGTH # avoid infinite loop + return length def main(): @@ -157,104 +150,81 @@ def main(): parser.add_argument("--model_type", default=None, type=str, required=True, help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys())) parser.add_argument("--model_name_or_path", default=None, type=str, required=True, - help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS)) + help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(MODEL_CLASSES.keys())) + parser.add_argument("--prompt", type=str, default="") - parser.add_argument("--padding_text", type=str, default="") - parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.") parser.add_argument("--length", type=int, default=20) - parser.add_argument("--num_samples", type=int, default=1) - parser.add_argument("--temperature", type=float, default=1.0, - help="temperature of 0 implies greedy sampling") - parser.add_argument("--repetition_penalty", type=float, default=1.0, - help="primarily useful for CTRL model; in that case, use 1.2") - parser.add_argument("--top_k", type=int, default=0) - parser.add_argument("--top_p", type=float, default=0.9) - parser.add_argument("--no_cuda", action='store_true', - help="Avoid using CUDA when available") - parser.add_argument('--seed', type=int, default=42, - help="random seed for initialization") - parser.add_argument('--stop_token', type=str, default=None, - help="Token at which text generation is stopped") + parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") + + parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 0 implies greedy sampling") + parser.add_argument("--repetition_penalty", type=float, default=1.0, 
help="primarily useful for CTRL model; in that case, use 1.2") + parser.add_argument("--k", type=int, default=0) + parser.add_argument("--p", type=float, default=0.9) + + parser.add_argument("--padding_text", type=str, default="", help="Padding text for Transfo-XL and XLNet.") + parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.") + + parser.add_argument("--seed", type=int, default=42, help="random seed for initialization") + parser.add_argument("--no_cuda", action="store_true", help="Avoid using CUDA when available") args = parser.parse_args() - args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") + args.device = torch.device( + "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu" + ) args.n_gpu = torch.cuda.device_count() set_seed(args) - args.model_type = args.model_type.lower() - model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + # Initialize the model and tokenizer + try: + args.model_type = args.model_type.lower() + model_class, tokenizer_class = MODEL_CLASSES[args.model_type] + except KeyError as ke: + raise ke( + "the model {} you specified is not supported. You are welcome to add it and open a PR :)" + ) + tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) model.to(args.device) model.eval() - if args.length < 0 and model.config.max_position_embeddings > 0: - args.length = model.config.max_position_embeddings - elif 0 < model.config.max_position_embeddings < args.length: - args.length = model.config.max_position_embeddings # No generation bigger than model size - elif args.length < 0: - args.length = MAX_LENGTH # avoid infinite loop - + args.length = adjust_length_to_model( + args.length, max_sequence_length=model.config.max_position_embeddings + ) logger.info(args) - if args.model_type in ["ctrl"]: - if args.temperature > 0.7: - logger.info('CTRL typically works better with lower temperatures (and lower top_k).') - while True: - xlm_lang = None - # XLM Language usage detailed in the issues #1414 - if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id') and hasattr(model.config, 'use_lang_emb') \ - and model.config.use_lang_emb: - if args.xlm_lang: - language = args.xlm_lang - else: - language = None - while language not in tokenizer.lang2id.keys(): - language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ") - xlm_lang = tokenizer.lang2id[language] + prompt_text = args.prompt if args.prompt else input("Model prompt >>> ") - # XLM masked-language modeling (MLM) models need masked token (see details in sample_sequence) - is_xlm_mlm = args.model_type in ["xlm"] and 'mlm' in args.model_name_or_path - if is_xlm_mlm: - xlm_mask_token = tokenizer.mask_token_id - else: - xlm_mask_token = None + # Different models need different input formatting and/or extra arguments + requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys() + model_kwargs = {} + if requires_preprocessing: + prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) + prompt_text, model_kwargs = prepare_input(args, model, tokenizer, prompt_text) + encoded_prompt = torch.tensor(tokenizer.encode(prompt_text, add_special_tokens=False)).unsqueeze(0) - raw_text = args.prompt if args.prompt else input("Model prompt >>> ") - if args.model_type in ["transfo-xl", "xlnet"]: - # Models with memory likes to have a long prompt for short inputs. 
- raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text - context_tokens = tokenizer.encode(raw_text, add_special_tokens=False) - if args.model_type == "ctrl": - if not any(context_tokens[0] == x for x in tokenizer.control_codes.values()): - logger.info("WARNING! You are not starting your generation from a control code so you won't get good results") - out = sample_sequence( - model=model, - context=context_tokens, - num_samples=args.num_samples, - length=args.length, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - repetition_penalty=args.repetition_penalty, - is_xlnet=bool(args.model_type == "xlnet"), - is_xlm_mlm=is_xlm_mlm, - xlm_mask_token=xlm_mask_token, - xlm_lang=xlm_lang, - device=args.device, - ) - out = out[:, len(context_tokens):].tolist() - for o in out: - text = tokenizer.decode(o, clean_up_tokenization_spaces=True) - text = text[: text.find(args.stop_token) if args.stop_token else None] + output_sequences = model.decode( + prompt_ids=encoded_prompt, + length=args.length, + temperature=args.temperature, + k=args.k, + p=args.p, + repetition_penalty=args.repetition_penalty, + device=args.device, + **model_kwargs, + ) - print(text) + generated_sequence = output_sequences.tolist()[ + encoded_prompt.size(1) : + ] # adapted to case where num_samples > 1 + text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) + text = text[: text.find(args.stop_token) if args.stop_token else None] + + print(text) - if args.prompt: - break return text -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index a884abd0a2..3d8c812c2f 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -18,11 +18,14 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging import os +import warnings import torch from torch import nn +from tqdm import trange from .modeling_auto import AutoModel, AutoModelWithLMHead +from .modeling_utils import Sampler logger = logging.getLogger(__name__) @@ -117,8 +120,7 @@ class PreTrainedEncoderDecoder(nn.Module): kwargs_common = { argument: value for argument, value in kwargs.items() - if not argument.startswith("encoder_") - and not argument.startswith("decoder_") + if not argument.startswith("encoder_") and not argument.startswith("decoder_") } kwargs_decoder = kwargs_common.copy() kwargs_encoder = kwargs_common.copy() @@ -186,51 +188,151 @@ class PreTrainedEncoderDecoder(nn.Module): Indices of decoder input sequence tokens in the vocabulary. kwargs: (`optional`) Remaining dictionary of keyword arguments. """ - # keyword arguments come in 3 flavors: encoder-specific (prefixed by - # `encoder_`), decoder-specific (prefixed by `decoder_`) and those - # that apply to the model as whole. - # We let the specific kwargs override the common ones in case of conflict. 
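As an illustration (not part of the patch) of the prefix convention described in the comment above, which this change factors out into the prepare_model_kwargs helper added further down: unprefixed arguments are shared by both sub-models, prefixed ones go only to the matching sub-model with the prefix stripped, and the decoder additionally receives the encoder's attention mask. The tensor names below are placeholders.

    common = {"attention_mask": attention_mask}
    specific = {"encoder_token_type_ids": token_type_ids, "decoder_lm_labels": lm_labels}
    encoder_kwargs, decoder_kwargs = PreTrainedEncoderDecoder.prepare_model_kwargs(**common, **specific)
    # encoder_kwargs -> {"attention_mask": ..., "token_type_ids": ...}
    # decoder_kwargs -> {"attention_mask": ..., "lm_labels": ..., "encoder_attention_mask": ...}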
+ kwargs_encoder, kwargs_decoder = self.prepare_model_kwargs(**kwargs) + + # Encode if needed (training, first prediction pass) + encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) + if encoder_hidden_states is None: + encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder) + encoder_hidden_states = encoder_outputs[0] + else: + encoder_outputs = () + + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states + decoder_outputs = self.decoder(decoder_input_ids, encoder_hidden_states, **kwargs_decoder) + + return decoder_outputs + encoder_outputs + + def decode( + self, + encoder_input_ids, + decoder_prompt_ids=None, + device=torch.device("cpu"), + length=10, + do_sample=False, + temperature=1.0, + k=9, + p=0., + repetition_penalty=1., + **kwargs + ): + """ Generic sequence generator for encoder-decoder models. + + For encoder-decoders the generation consists in: + - Performing a forward pass through the encoder once; + - Pass the encoder's hidden states to a decoding mechanism that + repeatedly calls the decoder to generate sequences. + + The method currently supports greedy decoding and sampling. See the + documentation of the `Sampler` class for more information about the + parameters related to sampling. + + Params: + **encoder_input_ids**: `torch.LongTensor` of shape (1, sequence_length) + The sequence to encode. + **decoder_prompt_ids**: (`optional`) `torch.LongTensor` of shape (1, sequence_length) + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape (1,) + **device**: (`optional`) `torch.device` + The device on which the prompt_ids will be initialized if not provided. + **length**: (`optional`) int + The length of the sequence to be generated. + **do_sample**: (`optional`) bool + If set to `False` we use greedy decoding; otherwise sampling. + **temperature**: (`optional`) float + The value used to module the next token probabilities. + **k**: (`optional`) int + The parameter used for k-filtering. + **p**: (`optional`) float + The parameter for nucleus sampling. Must be between 0 and 1. + **repetition_penalty**: (`optional`) float + The parameter for repetition penalty. + """ + if decoder_prompt_ids is None: + decoder_prompt_ids = torch.tensor([[]], dtype=torch.long, device=device) + + # When the model does not have a LM head `get_output_embeddings` + # returns `None`. We use this mechanism to determine whether we + # should proceed with decoding or not. + if self.decoder.get_output_embeddings() is None: + raise AttributeError("You tried do generated sequences with a decoder that does not have a LM Head.") + + # The followings checks that the decoder is on the same device as the one + # that is specified. It only works for models that fit on one GPU. + decoder_device = next(self.decoder.parameters()).device + if decoder_device != decoder_prompt_ids.device: + warnings.warn( + "The decoder is not on the same device as the prompt. 
Expected {}, got {}.".format( + decoder_prompt_ids.device, decoder_device + ) + ) + + kwargs_encoder, kwargs_decoder = self.prepare_model_kwargs(**kwargs) + with torch.no_grad(): + encoder_outputs = self.encoder(encoder_input_ids, **kwargs) + encoder_hidden_states = encoder_outputs[0] + kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states + + sampler_config = { + "k": k, + "p": p, + "do_sample": do_sample, + "temperature": temperature, + "repetition_penalty": repetition_penalty, + } + return self._greedy_decode_or_sample( + decoder_prompt_ids, length, sampler_config, **kwargs_decoder + ) + + def _greedy_decode_or_sample(self, prompt_ids, length, sampler_config, **kwargs_decoder): + sampler = Sampler(**sampler_config) + with torch.no_grad(): + generated_sequence = prompt_ids + for _ in trange(length): + arguments = self.decoder._prepare_inputs_for_decoding(generated_sequence, **kwargs_decoder) + outputs = self.decoder(**arguments) + next_tokens_logits = outputs[0][:, -1, :] + next_tokens = sampler.get_one_token(next_tokens_logits, generated_sequence) + generated_sequence = torch.cat((generated_sequence, next_tokens), dim=1) + + return generated_sequence.squeeze(0) + + @staticmethod + def prepare_model_kwargs(**kwargs): + """ Prepare the encoder and decoder's keyword arguments. + + Keyword arguments come in 3 flavors: + - encoder-specific (prefixed by `encoder_`) + - decoder-specific (prefixed by `decoder_`) + - those that apply to the model as whole. + + We let the specific kwargs override the common ones in case of + conflict. + """ kwargs_common = { argument: value for argument, value in kwargs.items() - if not argument.startswith("encoder_") - and not argument.startswith("decoder_") + if not argument.startswith("encoder_") and not argument.startswith("decoder_") } - kwargs_decoder = kwargs_common.copy() - kwargs_encoder = kwargs_common.copy() - kwargs_encoder.update( + decoder_kwargs = kwargs_common.copy() + encoder_kwargs = kwargs_common.copy() + encoder_kwargs.update( { argument[len("encoder_") :]: value for argument, value in kwargs.items() if argument.startswith("encoder_") } ) - kwargs_decoder.update( + decoder_kwargs.update( { argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_") } ) + decoder_kwargs["encoder_attention_mask"] = encoder_kwargs.get("attention_mask", None) - # Encode if needed (training, first prediction pass) - encoder_hidden_states = kwargs_encoder.pop("hidden_states", None) - if encoder_hidden_states is None: - encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder) - encoder_hidden_states = encoder_outputs[ - 0 - ] # output the last layer hidden state - else: - encoder_outputs = () - - # Decode - kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states - kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get( - "attention_mask", None - ) - decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder) - - return decoder_outputs + encoder_outputs + return encoder_kwargs, decoder_kwargs class Model2Model(PreTrainedEncoderDecoder): diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index a6a82f0dfe..473d07f733 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -36,7 +36,7 @@ from torch.nn.parameter import Parameter from .modeling_utils import PreTrainedModel, Conv1D, prune_conv1d_layer, SequenceSummary from .configuration_transfo_xl import TransfoXLConfig -from .modeling_transfo_xl_utilities 
import ProjectedAdaptiveLogSoftmax, sample_logits +from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax, sample_logits, LogUniformSampler from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) @@ -908,3 +908,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): outputs = [softmax_output, None] + outputs return outputs # (loss), logits or None if labels is not None (speed up adaptive softmax), new_mems, (all hidden states), (all attentions) + + def get_output_embeddings(self): + """ Double-check if you are using adaptive softmax. + """ + if self.sample_softmax > 0: + return self.out_layer + else: + return self.crit.out_layers[-1] diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 398172a88c..74038351fd 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -23,12 +23,14 @@ import json import logging import os from io import open +import warnings import six import torch from torch import nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F +from tqdm import trange from .configuration_utils import PretrainedConfig from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME @@ -87,6 +89,93 @@ class PreTrainedModel(nn.Module): def base_model(self): return getattr(self, self.base_model_prefix, self) + def decode(self, + prompt_ids=None, + device=torch.device('cpu'), + length=10, + do_sample=False, + temperature=1., + k=9, + p=0, + repetition_penalty=1, + **model_kwargs): + """ Generic sequence generator for single-stack models with a LM head. + + The method currently supports greedy decoding and sampling. See the + documentation of the `Sampler` class for more information about the + parameters related to sampling. + + Params: + **encoder_input_ids**: `torch.LongTensor` of shape (1, sequence_length) + The sequence to encode. + **decoder_prompt_ids**: (`optional`) `torch.LongTensor` of shape (1, sequence_length) + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape (1,) + **device**: (`optional`) `torch.device` + The device on which the prompt_ids will be initialized if not provided. + **length**: (`optional`) int + The length of the sequence to be generated. + **do_sample**: (`optional`) bool + If set to `False` we use greedy decoding; otherwise sampling. + **temperature**: (`optional`) float + The value used to module the next token probabilities. + **k**: (`optional`) int + The parameter used for k-filtering. + **p**: (`optional`) float + The parameter for nucleus sampling. Must be between 0 and 1. + **repetition_penalty**: (`optional`) float + The parameter for repetition penalty. + """ + + if prompt_ids is None: + prompt_ids = torch.tensor([[]], dtype=torch.long, device=device) + + # When the model does not have a LM head `get_output_embeddings` + # returns `None`. We use this mechanism to determine whether we + # should proceed with decoding or not. + if self.get_output_embeddings() is None: + raise AttributeError("You tried do generated sequences with a model that does not have a LM Head.") + + # The followings checks that the model is on the same device as the one + # that is specified. It only works for models that fit on one GPU. + model_device = next(self.parameters()).device + if model_device != prompt_ids.device: + warnings.warn( + "The model is not on the same device as the prompts. 
Expected {}, got {}.".format( + prompt_ids.device, model_device + ) + ) + + sampler_config = { + "k": k, + "p": p, + "do_sample": do_sample, + "temperature": temperature, + "repetition_penalty": repetition_penalty, + } + return self._greedy_decode_or_sample(prompt_ids, length, sampler_config, **model_kwargs) + + def _greedy_decode_or_sample(self, prompt_ids, length, sampler_config, **model_kwargs): + """ Generate text using greedy decoding or by sampling tokens.""" + sampler = Sampler(**sampler_config) + generated_sequence = prompt_ids + with torch.no_grad(): + for _ in trange(length): + arguments = self._prepare_inputs_for_decoding(generated_sequence, **model_kwargs) + outputs = self(**arguments) + next_tokens_logits = outputs[0][:, -1, :] + next_tokens = sampler.get_one_token( + next_tokens_logits, generated_sequence + ) + generated_sequence = torch.cat((generated_sequence, next_tokens), dim=1) + + return generated_sequence.squeeze(0) + + def _prepare_inputs_for_decoding(self, input_ids, **kwargs): + arguments = {"input_ids": input_ids} + arguments.update(kwargs) + return arguments + def get_input_embeddings(self): """ Get model's input embeddings """ @@ -859,3 +948,143 @@ def prune_layer(layer, index, dim=None): return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) else: raise ValueError("Can't prune layer of class {}".format(layer.__class__)) + + +class Sampler(object): + r""" Sampler is used to generate sequences of ids from logit inputs. + + Greedy decoding, which consists in chosing the most probable token at each + step, is the default behaviour. Sampling with varying temperature, top_k + and nucleus filtering is also implemented. + + Attributes: + **device**: ``torch.device`` + Device on which the computations will be run. + **do_sample**: bool + Whether to sample or do greedy decoding. + **k**: int between 0 and vocab_size + Parameter for the top-k filtering + **p**: float between 0 and 1 + Parameter for the nucleus filtering + **temperature**: strictly positive float + Parameter used to modulate the distribution over ids. Low temperatures + put more emphasis on highly probably token while high temperatures tend + to smooth the probability distribution. + **repetition_penalty**: strictly postitive float + The penalty applied to repeating ids + """ + + def __init__( + self, do_sample=False, k=9, p=0.0, temperature=1.0, repetition_penalty=1.0 + ): + self.k = k + self.p = p + self.do_sample = do_sample + self.temperature = temperature + self.repetition_penalty = repetition_penalty + + self.do_apply_repetition_penalty = True if repetition_penalty > 1 else False + + if self.p > 1: + warnings.warn( + """You are trying to apply nucleus filtering with a value of p greater than 1 ({}). + However p is a probability and its value must lie between 0 and 1. In effect, no filtering + will be applied. If this is not the behavior you expect, change the value of p.""".format( + self.p + ) + ) + + def get_one_token(self, next_token_logits, past_sequence): + logits = self.apply_repetition_penalty(next_token_logits, past_sequence) + if self.do_sample: + logits = self.apply_temperature(logits) + logits = self.apply_top_k_filter(logits) + logits = self.apply_nucleus_filter(logits) + return torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) + return torch.argmax(logits, dim=-1).unsqueeze(-1) + + def apply_repetition_penalty(self, logits, past_sequence): + """ Apply a penalty to tokens that appear more than once in the + generated sequence. + + .. Keskar, Nitish Shirish, et al. 
"Ctrl: A conditional transformer + language model for controllable generation." arXiv preprint + arXiv:1909.05858 (2019). + """ + if self.do_apply_repetition_penalty: + generated_token_idx = set(past_sequence[0].tolist()) + for token_idx in generated_token_idx: + logits[0, token_idx] /= self.repetition_penalty + return logits + + def apply_temperature(self, logits): + """ Shape the tokens' distribution through temperature. The higher the value + of the temperature, the more skewed towards high probability events the + distribution is. + + .. Goodfellow, Ian, Yoshua Bengio, and Aaron Courville. Deep learning. + MIT press, 2016. + """ + # when dividing a float by 0, torch returns inf which in turns breaks the + # multinomial with an error message that is not very helpful. It is better + # for the user to break the execution and explain why. + if self.temperature == 0: + raise ZeroDivisionError( + """You are trying to sample with a temperature equal to 0. + If you wanted to do greedy sampling, set instead `do_sample` to False. + Otherwise set the temperature to a value different from 0.""" + ) + return logits / self.temperature + + def apply_top_k_filter(self, logits): + """ Use the probability distribution of the tokens to determine the set + to be sampled from. Specifically we select the set of size k such that + the sum of its items' probabilities is maximum. + + .. Fan, Angela, Mike Lewis, and Yann Dauphin. "Hierarchical neural + story generation." arXiv preprint arXiv:1805.04833 (2018). + """ + if self.k > 0: + vocabulary_size = logits.size(-1) + if self.k > vocabulary_size: + warnings.warn( + """You provided a value for k ({}) that is larger than the vocabulary size ({}). + We adjusted k's value to the vocabulary size; if that was what you intended to do + we recommend setting k to 0 instead. It this is not the behavior you expected, + choose a value of k that is smaller than the vocabulary size.""".format( + self.k, vocabulary_size + ) + ) + self.k = vocabulary_size + + indices_to_remove = logits < torch.topk(logits, self.k)[0][..., -1, None] + logits[indices_to_remove] = -float("Inf") + + return logits + + def apply_nucleus_filter(self, logits): + """ Use the probability distribution of the tokens to determine the set + to be sampled from. Specifically, choose the smallest set such that the + sum of its items' probabilities is greater than a number p in [0,1]. + + .. Holtzman, Ari, et al. "The curious case of neural text + degeneration." arXiv preprint arXiv:1904.09751 (2019). + """ + if self.p > 0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + sorted_probabilities = F.softmax(sorted_logits, dim=-1) + cumulative_probabilities = torch.cumsum(sorted_probabilities, dim=-1) + + # Remove tokens with cumulative probability above the threshold, + # but keep the first token above the threshold. 
+ sorted_indices_to_remove = cumulative_probabilities > self.p + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + dim=-1, index=sorted_indices, src=sorted_indices_to_remove + ) + logits[indices_to_remove] = -float("Inf") + + return logits diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 257f0da394..295fff7943 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -646,7 +646,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, - lengths=lengths, + lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds) @@ -657,6 +657,33 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): return outputs + def _prepare_inputs_for_decoding(self, input_ids, **model_kwargs): + mask_token = model_kwargs.pop("mask_token", None) + language = model_kwargs.pop("language", None) + input_ids = self._append_mask_token(input_ids, mask_token) + langs = self._create_language_embeddings(input_ids, language) + arguments = {"input_ids": input_ids, "langs": langs} + arguments.update(model_kwargs) + + return arguments + + @staticmethod + def _append_mask_token(sequence, mask_token_id): + """ Append a [MASK] token at the end of the sequence that the MLM model + is going to try to predict. + """ + if mask_token_id is not None: + tokens_to_append = torch.full((1, 1), mask_token_id, dtype=torch.long) + return torch.cat((sequence, tokens_to_append), dim=1) + + return sequence + + @staticmethod + def _create_language_embeddings(sequence, language): + if language is not None: + return torch.tensor([language] * sequence.shape[1]).view(1, -1) + return None + @add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 225e5b059b..2153923dd2 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -972,6 +972,40 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) + def _prepare_inputs_for_decoding(self, input_ids, **model_kwargs): + input_ids = self._add_dummy_token(input_ids) + perm_mask = self._create_perm_mask(input_ids) + target_mapping = self._create_target_mapping(input_ids) + arguments = { + "input_ids": input_ids, + "perm_mask": perm_mask, + "target_mapping": target_mapping, + } + return arguments + + @staticmethod + def _add_dummy_token(sequence): + dummy = torch.zeros((sequence.size(0), 1), dtype=torch.long) + return torch.cat((sequence, dummy), dim=1) + + @staticmethod + def _create_perm_mask(sequence): + mask = torch.zeros( + (sequence.shape[0], sequence.shape[1], sequence.shape[1]), + dtype=torch.float, + ) + mask[:, :, -1] = 1.0 # Previous tokens don't see last token + return mask + + @staticmethod + def _create_target_mapping(sequence): + target_mapping = torch.zeros( + (sequence.shape[0], 1, sequence.shape[1]), + dtype=torch.float, + ) + target_mapping[0, 0, -1] = 1.0 # predict last token + return target_mapping + @add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", diff --git a/transformers/tests/sampling_test.py b/transformers/tests/sampling_test.py new file mode 100644 index 0000000000..98cc23bf2b --- /dev/null +++ b/transformers/tests/sampling_test.py @@ -0,0 +1,213 @@ +# coding=utf-8 +import sys +import unittest + +import numpy as np +import pytest + +from transformers import is_torch_available + +if is_torch_available(): + import torch + + from transformers import ( + BertConfig, + BertModel, + GPT2Config, + GPT2LMHeadModel, + OpenAIGPTConfig, + OpenAIGPTLMHeadModel, + TransfoXLConfig, + TransfoXLLMHeadModel, + XLMConfig, + XLMWithLMHeadModel, + XLNetConfig, + XLNetLMHeadModel, + Model2Model, + ) + from transformers.modeling_utils import Sampler +else: + pytestmark = pytest.mark.skip("Require Torch") + + +class SamplerTest(unittest.TestCase): + def test_nucleus_sampling(self): + inf = -float("Inf") + test_cases = ( + { + "p": 0, + "logits": torch.tensor([0.3, 0.1, 0.2]), + "expected": torch.tensor([0.3, 0.1, 0.2]), + }, + { + "p": 0.01, + "logits": torch.tensor([0.3, 0.1, 0.2]), + "expected": torch.tensor([0.3, inf, inf]), + }, + { + "p": 1, + "logits": torch.tensor([0.3, 0.1, 0.2]), + "expected": torch.tensor([0.3, 0.1, 0.2]), + }, + { + "p": 0.2, + "logits": torch.tensor([0.7, 0.1, 0.2]), + "expected": torch.tensor([0.7, inf, inf]), + }, + { + "p": 0.71, + "logits": torch.tensor([0.7, 0.1, 0.2]), + "expected": torch.tensor([0.7, inf, 0.2]), + }, + { + "p": 0.71, + "logits": torch.tensor([0.1, 0.7, 0.2]), + "expected": torch.tensor([inf, 0.7, 0.2]), + }, + { + "p": 0.71, + "logits": torch.tensor([0.7, 0.2, 0.1]), + "expected": torch.tensor([0.7, 0.2, inf]), + }, + { + "p": 0.91, + "logits": torch.tensor([0.7, 0.1, 0.2]), + "expected": torch.tensor([0.7, 0.1, 0.2]), + }, + ) + for case in test_cases: + config = { + "do_sample": True, + "temperature": 1.0, + "k": 0, + "p": case["p"], + "repetition_penalty": 1.0, + } + sampler = Sampler(**config) + filtered_logits = sampler.apply_nucleus_filter(case["logits"]) + np.testing.assert_array_equal(case["expected"].numpy(), filtered_logits.numpy()) + + def test_top_k_filter(self): + inf = -float("Inf") + test_cases = ( + { + "k": 0, + "logits": torch.tensor([0.7, 0.1, 0.2]), + "expected": torch.tensor([0.7, 0.1, 0.2]), + }, + { + "k": 1, + "logits": torch.tensor([0.7, 0.1, 0.2]), + "expected": torch.tensor([0.7, inf, inf]), + }, + { + "k": 2, + "logits": torch.tensor([0.7, 0.1, 0.2]), + "expected": torch.tensor([0.7, inf, 0.2]), + }, + { + "k": 3, + "logits": torch.tensor([0.7, 0.1, 0.2]), + "expected": torch.tensor([0.7, 0.1, 0.2]), + }, + ) + for case in test_cases: + config = { + "do_sample": True, + "temperature": 1.0, + "k": case["k"], + "p": 0, + "repetition_penalty": 1.0, + } + sampler = Sampler(**config) + filtered_logits = sampler.apply_top_k_filter(case["logits"]) + np.testing.assert_array_equal(case["expected"].numpy(), filtered_logits.numpy()) + + @pytest.mark.skipif(sys.version_info < (3, 2), reason="assertWarns() requires Python >= 3.2") + def test_wrong_k_value(self): + case = {"k": 10, "vocab_size": 5} + config = { + "do_sample": True, + "temperature": 1.0, + "k": case["k"], + "p": 0, + "repetition_penalty": 1.0, + } + sampler = Sampler(**config) + next_token_logits = torch.rand(case["vocab_size"]).unsqueeze(0) + past_sequence = torch.tensor([]) + with self.assertWarns(UserWarning): + _ = sampler.get_one_token(next_token_logits, past_sequence) + + def test_zero_temperature(self): + temperature = 0 + config = { + "do_sample": True, + "temperature": temperature, + "k": 0, + "p": 0, + 
"repetition_penalty": 1.0, + } + sampler = Sampler(**config) + next_token_logits = torch.rand(10).unsqueeze(0) + past_sequence = torch.tensor([]) + with self.assertRaises(ZeroDivisionError): + _ = sampler.get_one_token(next_token_logits, past_sequence) + + +class SamplerSingleStackTest(unittest.TestCase): + def test_raises_exception_when_no_LM_head(self): + models = [BertModel(BertConfig())] + for model in models: + with self.assertRaises(AttributeError): + model.decode() + + @pytest.mark.slow + def test_forward_pass_and_output_length(self): + models = { + "XLNet": XLNetLMHeadModel(XLNetConfig()), + "XLM": XLMWithLMHeadModel(XLMConfig()), + "TransfoXL": TransfoXLLMHeadModel(TransfoXLConfig()), + "GPT2": GPT2LMHeadModel(GPT2Config()), + "GPT": OpenAIGPTLMHeadModel(OpenAIGPTConfig()), + } + kwargs = { + "XLNet": {}, + "XLM": {"mask_token": 0}, + "TransfoXL": {}, + "GPT2": {}, + "GPT": {}, + } + prompt = torch.tensor([[1, 2, 3]], dtype=torch.long) + generated_length = 5 + expected_length = 8 + + for name, model in models.items(): + kwargs_model = kwargs[name] + output = model.decode(prompt_ids=prompt, length=generated_length, **kwargs_model) + self.assertEqual(len(output), expected_length) + + +class SamplerEncoderDecoderTest(unittest.TestCase): + @pytest.mark.slow + def test_forward_pass_and_output_length(self): + model = Model2Model.from_pretrained("bert-base-uncased") + + encoder_input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long) + prompt = torch.tensor([[1, 2, 3]], dtype=torch.long) + generated_length = 5 + expected_length = 8 + + output = model.decode( + encoder_input_ids, + decoder_prompt_ids=prompt, + k=2, + p=0.5, + repetition_penalty=2, + length=generated_length, + ) + self.assertEqual(len(output), expected_length) + + +if __name__ == "__main__": + unittest.main() From 67a8be8e90a7fbd5e0bceff9f29fb89ccabb61be Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 10 Dec 2019 17:50:32 +0100 Subject: [PATCH 085/302] fix backward in tests --- transformers/tests/modeling_common_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index ed6f950e25..cd4cf247a6 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -277,8 +277,7 @@ class CommonTestCases: inputs = inputs_dict.copy() inputs['head_mask'] = head_mask - with torch.no_grad(): - outputs = model(**inputs) + outputs = model(**inputs) # Test that we can get a gradient back for importance score computation output = sum(t.sum() for t in outputs[0]) From dc4e9e5cb36ae9bf5185b49b1cbc9106857abd54 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 10 Dec 2019 19:21:20 +0000 Subject: [PATCH 086/302] DataParallel for SQuAD + fix XLM --- examples/run_squad.py | 6 +++++- transformers/data/metrics/squad_metrics.py | 7 ++++++- transformers/tokenization_xlm.py | 4 ++++ 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 2df29014ef..5e3f9663e2 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -299,10 +299,14 @@ def evaluate(args, model, tokenizer, prefix=""): # XLNet and XLM use a more complex post-processing procedure if args.model_type in ['xlnet', 'xlm']: + + start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top + end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top + predictions = 
compute_predictions_log_probs(examples, features, all_results, args.n_best_size, args.max_answer_length, output_prediction_file, output_nbest_file, output_null_log_odds_file, - model.config.start_n_top, model.config.end_n_top, + start_n_top, end_n_top, args.version_2_with_negative, tokenizer, args.verbose_logging) else: predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size, diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 0755c0ab7a..7b03255f49 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -695,7 +695,12 @@ def compute_predictions_log_probs( tok_text = " ".join(tok_text.split()) orig_text = " ".join(orig_tokens) - final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case, + if hasattr(tokenizer, "do_lower_case"): + do_lower_case = tokenizer.do_lower_case + else: + do_lower_case = tokenizer.do_lowercase_and_remove_accent + + final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging) if final_text in seen_predictions: diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py index 6c9f8e5e5c..8def80bec4 100644 --- a/transformers/tokenization_xlm.py +++ b/transformers/tokenization_xlm.py @@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer): additional_special_tokens=additional_special_tokens, **kwargs) + + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + # cache of sm.MosesPunctNormalizer instance self.cache_moses_punct_normalizer = dict() # cache of sm.MosesTokenizer instance From 6a73382706ce3c6905023872f63a680f0eb419a4 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Tue, 10 Dec 2019 14:33:24 -0500 Subject: [PATCH 087/302] Complete warning + cleanup --- examples/run_squad.py | 1 - transformers/tokenization_utils.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 5e3f9663e2..79c8537a4b 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -299,7 +299,6 @@ def evaluate(args, model, tokenizer, prefix=""): # XLNet and XLM use a more complex post-processing procedure if args.model_type in ['xlnet', 'xlm']: - start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index f4395cd82c..cb931b0eaf 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -226,7 +226,7 @@ class PreTrainedTokenizer(object): self.max_len = max_len if max_len is not None else int(1e12) - # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed. + # Padding side is right by default and over-riden in subclasses. If specified in the kwargs, it is changed. 
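For context, a hedged usage sketch of the padding_side override described in the comment above; the tokenizer class and checkpoint name are only examples, and it is assumed that kwargs passed to from_pretrained are forwarded to the tokenizer constructor:

    from transformers import BertTokenizer

    # padding_side defaults to "right"; passing it as a keyword argument overrides the class default
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", padding_side="left")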
self.padding_side = kwargs.pop('padding_side', self.padding_side) # Added tokens @@ -1003,7 +1003,7 @@ class PreTrainedTokenizer(object): ) if pad_to_max_length and max_length is None and self.max_len > 10000: - logger.warning("Sequence can't be padded as the maximum ") + logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.") if needs_to_be_padded: difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"]) From 58d75aa310e872723ba92ee1f0cb575ae9e2eaef Mon Sep 17 00:00:00 2001 From: Leo Dirac Date: Tue, 10 Dec 2019 11:36:56 -0800 Subject: [PATCH 088/302] Progress indicator improvements when downloading pre-trained models. --- transformers/file_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 24abd60781..68de4e6e2f 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -21,7 +21,7 @@ import boto3 from botocore.config import Config from botocore.exceptions import ClientError import requests -from tqdm import tqdm +from tqdm.auto import tqdm from contextlib import contextmanager logger = logging.getLogger(__name__) # pylint: disable=invalid-name @@ -245,7 +245,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0): return content_length = response.headers.get('Content-Length') total = resume_size + int(content_length) if content_length is not None else None - progress = tqdm(unit="B", total=total, initial=resume_size) + progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading") for chunk in response.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) From 9a24e0cf767601858f13808d37b5b71787b7641e Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 11 Dec 2019 00:33:25 +0100 Subject: [PATCH 089/302] Refactored qa pipeline argument handling + unittests --- transformers/pipelines.py | 87 ++++++++++++++++++---------- transformers/tests/pipelines_test.py | 33 ++++++++++- 2 files changed, 87 insertions(+), 33 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index eec4932321..da8b0b65a7 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -98,14 +98,11 @@ class TextClassificationPipeline(Pipeline): class QuestionAnsweringPipeline(Pipeline): """ Question Answering pipeling involving Tokenization and Inference. 
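For reference, a hedged usage sketch (not part of the patch) of the input forms the refactored argument handling below accepts; model and tokenizer construction is elided and the strings are toy inputs:

    nlp = QuestionAnsweringPipeline(model, tokenizer)

    # Three equivalent ways to ask one question about one context
    nlp(question="Where is the company based ?", context="The company is based in New York and Paris")
    nlp(X={"question": "Where is the company based ?", "context": "The company is based in New York and Paris"})
    nlp(data=QuestionAnsweringPipeline.create_sample(
        question="Where is the company based ?",
        context="The company is based in New York and Paris"))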
- TODO: - - top-k answers - - return start/end chars - - return score """ - def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]): - super().__init__(model, tokenizer) + @classmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + pass @staticmethod def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: @@ -116,6 +113,55 @@ class QuestionAnsweringPipeline(Pipeline): else: return SquadExample(None, question, context, None, None, None) + @staticmethod + def handle_args(*inputs, **kwargs) -> List[SquadExample]: + # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating + if inputs is not None and len(inputs) > 1: + kwargs['X'] = inputs + + # Generic compatibility with sklearn and Keras + # Batched data + if 'X' in kwargs or 'data' in kwargs: + data = kwargs['X'] if 'X' in kwargs else kwargs['data'] + + if not isinstance(data, list): + data = [data] + + for i, item in enumerate(data): + if isinstance(item, dict): + if any(k not in item for k in ['question', 'context']): + raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') + data[i] = QuestionAnsweringPipeline.create_sample(**item) + + elif isinstance(item, SquadExample): + continue + else: + raise ValueError( + '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' + .format('X' if 'X' in kwargs else 'data') + ) + inputs = data + + # Tabular input + elif 'question' in kwargs and 'context' in kwargs: + if isinstance(kwargs['question'], str): + kwargs['question'] = [kwargs['question']] + + if isinstance(kwargs['context'], str): + kwargs['context'] = [kwargs['context']] + + inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] + else: + raise ValueError('Unknown arguments {}'.format(kwargs)) + + if not isinstance(inputs, list): + inputs = [inputs] + + return inputs + + def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]): + super().__init__(model, tokenizer) + def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: args = ['input_ids', 'attention_mask'] model_type = type(self.model).__name__.lower() @@ -131,10 +177,6 @@ class QuestionAnsweringPipeline(Pipeline): else: return {k: [feature.__dict__[k] for feature in features] for k in args} - @classmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - pass - def __call__(self, *texts, **kwargs): # Set defaults values kwargs.setdefault('topk', 1) @@ -149,29 +191,10 @@ class QuestionAnsweringPipeline(Pipeline): if kwargs['max_answer_len'] < 1: raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) - # Position args - if texts is not None and len(texts) > 1: - (texts, ) = texts - - # Generic compatibility with sklearn and Keras - elif 'X' in kwargs and not texts: - texts = kwargs.pop('X') - - # Batched data - elif 'data' in kwargs: - texts = kwargs.pop('data') - - # Tabular input - elif 'question' in kwargs and 'context' in kwargs: - texts = QuestionAnsweringPipeline.create_sample(kwargs['question'], kwargs['context']) - else: - raise ValueError('Unknown arguments {}'.format(kwargs)) - - if not isinstance(texts, list): - texts = [texts] + examples = QuestionAnsweringPipeline.handle_args(texts, **kwargs) # Convert inputs to features - features = squad_convert_examples_to_features(texts, self.tokenizer, 
kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) + features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) fw_args = self.inputs_for_model(features) if is_tf_available(): @@ -188,7 +211,7 @@ class QuestionAnsweringPipeline(Pipeline): start, end = start.cpu().numpy(), end.cpu().numpy() answers = [] - for (example, feature, start_, end_) in zip(texts, features, start, end): + for (example, feature, start_, end_) in zip(examples, features, start, end): # Normalize logits and spans to retrieve the answer start_ = np.exp(start_) / np.sum(np.exp(start_)) end_ = np.exp(end_) / np.sum(np.exp(end_)) diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py index 36d6e013c3..ee10234269 100644 --- a/transformers/tests/pipelines_test.py +++ b/transformers/tests/pipelines_test.py @@ -40,7 +40,38 @@ class QuestionAnsweringPipelineTest(unittest.TestCase): # Batch case with topk = 2 a = nlp(question=['What is the name of the company I\'m working for ?', 'Where is the company based ?'], - context=['I\'m working for Huggingface.', 'The company is based in New York and Paris'], topk=2) + context=['Where is the company based ?', 'The company is based in New York and Paris'], topk=2) + self.check_answer_structure(a, 2, 2) + + # check for data keyword + a = nlp(data=nlp.create_sample(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.')) + self.check_answer_structure(a, 1, 1) + + a = nlp(data=nlp.create_sample(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.'), topk=2) + self.check_answer_structure(a, 1, 2) + + a = nlp(data=[ + nlp.create_sample(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.'), + nlp.create_sample(question='I\'m working for Huggingface.', context='The company is based in New York and Paris'), + ]) + self.check_answer_structure(a, 2, 1) + + a = nlp(data=[ + {'question': 'What is the name of the company I\'m working for ?', 'context': 'I\'m working for Huggingface.'}, + {'question': 'Where is the company based ?', 'context': 'The company is based in New York and Paris'}, + ]) + self.check_answer_structure(a, 2, 1) + + # X keywords + a = nlp(X=nlp.create_sample( + question='Where is the company based ?', context='The company is based in New York and Paris' + )) + self.check_answer_structure(a, 1, 1) + + a = nlp(X=[ + {'question': 'What is the name of the company I\'m working for ?', 'context': 'I\'m working for Huggingface.'}, + {'question': 'Where is the company based ?', 'context': 'The company is based in New York and Paris'}, + ], topk=2) self.check_answer_structure(a, 2, 2) @patch('transformers.pipelines.is_torch_available', return_value=False) From fafd4c86ecb63bb90b095bbd23453553e33fe99d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 11 Dec 2019 13:47:27 +0100 Subject: [PATCH 090/302] fix TF 2.0 version of T5 - update conversion script --- .../convert_pytorch_checkpoint_to_tf2.py | 11 ++--- transformers/file_utils.py | 3 ++ transformers/modeling_t5.py | 21 +++++++-- transformers/modeling_tf_t5.py | 43 ++++++++++++------- transformers/modeling_tf_utils.py | 6 +-- transformers/modeling_utils.py | 12 +++++- 6 files changed, 65 insertions(+), 31 deletions(-) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index 
76d75b43e4..4a9832f123 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -120,24 +120,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path) if compare_with_pt_model: - inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] - tf_inputs = tf_model.dummy_inputs - tfo = tf_model(tf_inputs, training=False) # build the network + tfo = tf_model(tf_model.dummy_inputs, training=False) # build the network state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu') pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None, config=config, state_dict=state_dict) - pt_inputs = torch.tensor(inputs_list) with torch.no_grad(): - pto = pt_model(pt_inputs) + pto = pt_model(**pt_model.dummy_inputs) - np_pt = pto[0].detach().numpy() + np_pt = pto[0].numpy() np_tf = tfo[0].numpy() diff = np.amax(np.abs(np_pt - np_tf)) print("Max absolute difference between models outputs {}".format(diff)) - assert diff <= 2e-2, "Error, model absolute difference is >2e-2" + assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff) # Save pytorch-model print("Save TensorFlow model to {}".format(tf_dump_path)) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 24abd60781..e36bbf4eeb 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -73,6 +73,9 @@ TF2_WEIGHTS_NAME = 'tf_model.h5' TF_WEIGHTS_NAME = 'model.ckpt' CONFIG_NAME = "config.json" +DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] +DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] + def is_torch_available(): return _torch_available diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index ffc4d8bb3f..149b977abc 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -32,7 +32,7 @@ from torch.nn import CrossEntropyLoss, MSELoss from .modeling_utils import PreTrainedModel from .configuration_t5 import T5Config -from .file_utils import add_start_docstrings +from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK logger = logging.getLogger(__name__) @@ -451,6 +451,15 @@ class T5PreTrainedModel(PreTrainedModel): load_tf_weights = load_tf_weights_in_t5 base_model_prefix = "transformer" + @property + def dummy_inputs(self): + input_ids = torch.tensor(DUMMY_INPUTS) + input_mask = torch.tensor(DUMMY_MASK) + dummy_inputs = {'decoder_input_ids': input_ids, + 'encoder_input_ids': input_ids, + 'decoder_attention_mask': input_mask} + return dummy_inputs + def _init_weights(self, module): """ Initialize the weights """ factor = self.config.initializer_factor # Used for testing weights initialization @@ -534,9 +543,10 @@ class T5Stack(T5PreTrainedModel): # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - # T5 has a mask that can compare sequence ids, we simulate this here with this transposistion + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition # Cf. 
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 - extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2)) + # extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2)) + extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility extended_attention_mask = (1.0 - extended_attention_mask) * -1e9 @@ -548,6 +558,10 @@ class T5Stack(T5PreTrainedModel): if encoder_attention_mask.dim() == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: @@ -590,6 +604,7 @@ class T5Stack(T5PreTrainedModel): hidden_states = layer_outputs[0] if i == 0: # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) position_bias = layer_outputs[2 if self.output_attentions else 1] if self.is_decoder: encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2] diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index 0b3b1116f2..fd25328ac6 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -26,7 +26,7 @@ import tensorflow as tf from .configuration_t5 import T5Config from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list -from .file_utils import add_start_docstrings +from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK logger = logging.getLogger(__name__) @@ -61,7 +61,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer): super(TFT5LayerNorm, self).build(input_shape) def call(self, x): - variance = tf.math.reduce_min(tf.math.square(x), axis=-1, keepdims=True) + variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True) x = x * tf.math.rsqrt(variance + self.variance_epsilon) return self.weight * x @@ -231,19 +231,19 @@ class TFT5Attention(tf.keras.layers.Layer): cache[self.layer_id] = (k, v) # q = q / math.sqrt(dim_per_head) # No scaling in T5 - scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + # scores = tf.matmul(q, k, transpose_b=True) # (bs, n_heads, qlen, klen) + scores = tf.einsum('bnqd,bnkd->bnqk', q, k) # (bs, n_heads, qlen, klen) if position_bias is None: if not self.has_relative_attention_bias: raise ValueError("No position_bias provided and no weights to compute position_bias") position_bias = self.compute_bias(qlen, klen) + if mask is not None: + position_bias = position_bias + mask + # mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) + # scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) + scores += position_bias - - if mask is not None: - scores += mask - # mask = (mask == 0).expand_as(scores) # (bs, n_heads, qlen, klen) - # 
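For readers skimming the diff, the corrected TFT5LayerNorm is a scale-only norm: it divides by the root of the mean of squares over the hidden dimension and never subtracts the mean. An equivalent NumPy sketch (the epsilon value is illustrative, not read from a config)::

    import numpy as np

    def t5_layer_norm(x, weight, eps=1e-6):
        # mean of squares over the last axis, matching reduce_mean(square(x)) above
        variance = np.mean(np.square(x), axis=-1, keepdims=True)
        return weight * (x / np.sqrt(variance + eps))

    x = np.array([[1.0, -2.0, 3.0]])
    print(t5_layer_norm(x, weight=np.ones(3)))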
scores.masked_fill_(mask, -float('inf')) # (bs, n_heads, qlen, klen) - weights = tf.nn.softmax(scores, axis=-1) # (bs, n_heads, qlen, klen) weights = self.dropout(weights, training=training) # (bs, n_heads, qlen, klen) @@ -350,11 +350,11 @@ class TFT5Block(tf.keras.layers.Layer): head_mask=head_mask, training=training) hidden_states = cross_attention_outputs[0] - outputs = cross_attention_outputs[1:] + outputs + outputs = outputs + cross_attention_outputs[1:] hidden_states = self.layer[2](hidden_states, training=training) outputs = (hidden_states,) + outputs # add attentions if we output them - return outputs + return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) #################################################### @@ -418,7 +418,13 @@ class TFT5MainLayer(tf.keras.layers.Layer): # positions we want to attend and -10000.0 for masked positions. # Since we are adding it to the raw scores before the softmax, this is # effectively the same as removing these entirely. - extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # extended_attention_mask = tf.math.equal(extended_attention_mask, + # tf.transpose(extended_attention_mask, perm=(-1, -2))) + + extended_attention_mask = (1.0 - extended_attention_mask) * -1e9 if self.is_decoder: # If a 2D ou 3D attention mask is provided for the cross-attention @@ -430,7 +436,12 @@ class TFT5MainLayer(tf.keras.layers.Layer): if num_dims_encoder_attention_mask == 2: encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] - encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposistion + # Cf. 
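The reordering above folds the additive padding mask (0 for kept positions, -1e9 for masked ones) into position_bias once, so every layer that reuses position_bias also inherits the masking. A toy NumPy illustration with made-up values and shapes reduced to a single query row::

    import numpy as np

    scores = np.array([[2.0, 1.0, 0.5]])           # raw q.k scores for one query
    position_bias = np.array([[0.1, 0.0, -0.1]])    # relative-position bias
    mask = np.array([[0.0, 0.0, -1e9]])             # last key position is padding

    position_bias = position_bias + mask            # folded in once, reused afterwards
    scores = scores + position_bias
    weights = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)
    print(weights)                                   # padded position gets ~0 attention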
https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9 else: encoder_extended_attention_mask = None @@ -463,6 +474,8 @@ class TFT5MainLayer(tf.keras.layers.Layer): training=training) hidden_states = layer_outputs[0] if i == 0: + # We share the position biases between the layers - the first layer store them + # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias) position_bias = layer_outputs[2 if self.output_attentions else 1] if self.is_decoder: encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2] @@ -502,8 +515,8 @@ class TFT5PreTrainedModel(TFPreTrainedModel): @property def dummy_inputs(self): - input_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) - input_mask = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + input_ids = tf.constant(DUMMY_INPUTS) + input_mask = tf.constant(DUMMY_MASK) dummy_inputs = {'decoder_input_ids': input_ids, 'encoder_input_ids': input_ids, 'decoder_attention_mask': input_mask} diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index ed8fdb74c9..8d010e589e 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -24,13 +24,11 @@ import os import tensorflow as tf from .configuration_utils import PretrainedConfig -from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME, DUMMY_INPUTS from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model logger = logging.getLogger(__name__) -DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] - class TFPreTrainedModel(tf.keras.Model): r""" Base class for all TF models. @@ -59,7 +57,7 @@ class TFPreTrainedModel(tf.keras.Model): Returns: tf.Tensor with dummy inputs """ - return tf.constant(DUMMY_INPUTS) + return {'input_ids': tf.constant(DUMMY_INPUTS)} def __init__(self, config, *inputs, **kwargs): super(TFPreTrainedModel, self).__init__(*inputs, **kwargs) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index aa0e0e6191..ae515d6870 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -31,11 +31,10 @@ from torch.nn import CrossEntropyLoss from torch.nn import functional as F from .configuration_utils import PretrainedConfig -from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME +from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME, DUMMY_INPUTS logger = logging.getLogger(__name__) - try: from torch.nn import Identity except ImportError: @@ -71,6 +70,15 @@ class PreTrainedModel(nn.Module): load_tf_weights = lambda model, config, path: None base_model_prefix = "" + @property + def dummy_inputs(self): + """ Dummy inputs to do a forward pass in the network. 
+ + Returns: + torch.Tensor with dummy inputs + """ + return {'input_ids': torch.tensor(DUMMY_INPUTS)} + def __init__(self, config, *inputs, **kwargs): super(PreTrainedModel, self).__init__() if not isinstance(config, PretrainedConfig): From b040bff6df09923870f44fb5402e895d57327e85 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 11 Dec 2019 14:13:58 +0100 Subject: [PATCH 091/302] Added supported model to AutoModelTokenClassification --- transformers/__init__.py | 4 +- transformers/modeling_auto.py | 123 ++++++++++++++++++++++++++++++- transformers/modeling_tf_auto.py | 111 +++++++++++++++++++++++++++- 3 files changed, 231 insertions(+), 7 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index 4300409257..c474696062 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -69,7 +69,7 @@ from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFI if is_torch_available(): from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, - AutoModelWithLMHead) + AutoModelWithLMHead, AutoModelForTokenClassification) from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, @@ -124,7 +124,7 @@ if is_torch_available(): if is_tf_available(): from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, - TFAutoModelWithLMHead) + TFAutoModelWithLMHead, TFAutoModelForTokenClassification) from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, TFBertModel, TFBertForPreTraining, diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 041115cc61..cb877643ab 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -22,16 +22,19 @@ from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRL DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, TransfoXLConfig, XLMConfig, XLNetConfig) -from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering +from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \ + BertForTokenClassification from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel from .modeling_ctrl import CTRLModel, CTRLLMHeadModel from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel -from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering +from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \ + XLNetForTokenClassification from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification -from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice +from .modeling_camembert import CamembertModel, CamembertForMaskedLM, 
CamembertForSequenceClassification, \ + CamembertForMultipleChoice, CamembertForTokenClassification from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering from .modeling_utils import PreTrainedModel, SequenceSummary @@ -699,3 +702,117 @@ class AutoModelForQuestionAnswering(object): raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'xlnet', 'xlm', 'distilbert', 'albert'".format(pretrained_model_name_or_path)) + + +class AutoModelForTokenClassification: + def __init__(self): + raise EnvironmentError("AutoModelForTokenClassification is designed to be instantiated " + "using the `AutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForTokenClassification.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `xlm` configuration class: XLMModel (XLM model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = AutoModelForTokenClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, CamembertConfig): + return CamembertForTokenClassification(config) + elif isinstance(config, BertConfig): + return BertForTokenClassification(config) + elif isinstance(config, XLNetConfig): + return XLNetForTokenClassification(config) + raise ValueError("Unrecognized configuration class {}".format(config)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the question answering model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `distilbert`: DistilBertForTokenClassification (DistilBERT model) + - contains `albert`: AlbertForTokenClassification (ALBERT model) + - contains `bert`: BertForTokenClassification (Bert model) + - contains `xlnet`: XLNetForTokenClassification (XLNet model) + - contains `xlm`: XLMForTokenClassification (XLM model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). 
In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. + In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = AutoModelForTokenClassification.from_pretrained('./test/bert_model/') # E.g. 
model was saved using `save_pretrained('./test/saved_model/')` + model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = AutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'camembert' in pretrained_model_name_or_path: + return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'bert' in pretrained_model_name_or_path: + return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. Should contains one of " + "'bert', 'xlnet', 'camembert'".format(pretrained_model_name_or_path)) diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index e78b91cfcc..1097f77a59 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -22,11 +22,13 @@ from .configuration_auto import (BertConfig, CTRLConfig, DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, TransfoXLConfig, XLMConfig, XLNetConfig) -from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering +from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, \ + TFBertForQuestionAnswering, TFBertForTokenClassification from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel -from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple +from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \ + TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification @@ -668,3 +670,108 @@ class TFAutoModelForQuestionAnswering(object): raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'distilbert', 'bert', 'xlnet', 'xlm'".format(pretrained_model_name_or_path)) + + +class TFAutoModelForTokenClassification: + def __init__(self): + raise EnvironmentError("TFAutoModelForTokenClassification is designed to be instantiated " + "using the `TFAutoModelForTokenClassification.from_pretrained(pretrained_model_name_or_path)` or " + "`AutoModelForTokenClassification.from_config(config)` methods.") + + @classmethod + def from_config(cls, config): + r""" Instantiates one of the base model classes of the library + from a configuration. 
+ + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + The model class to instantiate is selected based on the configuration class: + - isInstance of `bert` configuration class: BertModel (Bert model) + - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + + Examples:: + + config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. + model = TFAutoModelForTokenClassification.from_config(config) # E.g. model was saved using `save_pretrained('./test/saved_model/')` + """ + if isinstance(config, BertConfig): + return TFBertForTokenClassification(config) + elif isinstance(config, XLNetConfig): + return TFXLNetForTokenClassification(config) + raise ValueError("Unrecognized configuration class {}".format(config)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + r""" Instantiates one of the question answering model classes of the library + from a pre-trained model configuration. + + The `from_pretrained()` method takes care of returning the correct model class instance + using pattern matching on the `pretrained_model_name_or_path` string. + + The model class to instantiate is selected as the first pattern matching + in the `pretrained_model_name_or_path` string (in the following order): + - contains `bert`: BertForTokenClassification (Bert model) + - contains `xlnet`: XLNetForTokenClassification (XLNet model) + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) + To train the model, you should first set it back in training mode with `model.train()` + + Params: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. + - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + + model_args: (`optional`) Sequence of positional arguments: + All remaning positional arguments will be passed to the underlying model's ``__init__`` method + + config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: + + - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or + - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory. + - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory. + + state_dict: (`optional`) dict: + an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file. + This option can be used if you want to create a model from a pretrained configuration but load your own weights. 
+ In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + configuration should be cached if the standard cache should not be used. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model weights and configuration files and override the cached versions if they exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + output_loading_info: (`optional`) boolean: + Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages. + + kwargs: (`optional`) Remaining dictionary of keyword arguments: + Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded: + + - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done) + - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function. + + Examples:: + + model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased') # Download model and configuration from S3 and cache. + model = TFAutoModelForTokenClassification.from_pretrained('./test/bert_model/') # E.g. model was saved using `save_pretrained('./test/saved_model/')` + model = TFAutoModelForTokenClassification.from_pretrained('bert-base-uncased', output_attention=True) # Update configuration during loading + assert model.config.output_attention == True + # Loading from a TF checkpoint file instead of a PyTorch model (slower) + config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json') + model = TFAutoModelForTokenClassification.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config) + + """ + if 'bert' in pretrained_model_name_or_path: + return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlnet' in pretrained_model_name_or_path: + return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " + "'bert', 'xlnet'".format(pretrained_model_name_or_path)) From 4c12860f7ae61659aed2675498350a386fc4e122 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 11 Dec 2019 09:22:37 -0500 Subject: [PATCH 092/302] Remove misleading documentation --- transformers/tokenization_utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index cb931b0eaf..68a767fe82 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -628,7 +628,6 @@ class PreTrainedTokenizer(object): Take care of added tokens. text: The sequence to be encoded. - return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False). **kwargs: passed to the child `self.tokenize()` method """ def lowercase_text(t): From c28273793ec41c98153b23e29b9c7228c4149aae Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 11 Dec 2019 14:52:01 +0100 Subject: [PATCH 093/302] Add missing DistilBert and Roberta to AutoModelForTokenClassification --- transformers/modeling_auto.py | 25 ++++++++++++++++++------- transformers/modeling_tf_auto.py | 18 +++++++++++++++--- 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index cb877643ab..c76e5b78b3 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -31,8 +31,10 @@ from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, \ XLNetForTokenClassification from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification -from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification +from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, \ + RobertaForTokenClassification +from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, \ + DistilBertForSequenceClassification, DistilBertForTokenClassification from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, \ CamembertForMultipleChoice, CamembertForTokenClassification from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering @@ -720,8 +722,9 @@ class AutoModelForTokenClassification: - isInstance of `distilbert` configuration class: DistilBertModel (DistilBERT model) - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) - - isInstance of `xlm` configuration class: XLMModel (XLM model) - + - isInstance of `camembert` configuration class: CamembertModel (Camembert model) + - isInstance of `roberta` configuration class: RobertaModel (Roberta model) + Examples:: config = BertConfig.from_pretrained('bert-base-uncased') # Download configuration from S3 and cache. 
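The Auto classes in this series route from_pretrained purely on substring matching, so the order of the checks matters: 'camembert-base' also contains 'bert' and has to be tested first. A stripped-down sketch of the same first-match-wins idea, returning class names as strings for brevity::

    def pick_token_classification_class(name):
        # mirrors the dispatch order used by AutoModelForTokenClassification.from_pretrained
        if 'camembert' in name:
            return 'CamembertForTokenClassification'
        elif 'bert' in name:
            return 'BertForTokenClassification'
        elif 'xlnet' in name:
            return 'XLNetForTokenClassification'
        raise ValueError("Unrecognized model identifier in {}".format(name))

    print(pick_token_classification_class('camembert-base'))     # Camembert, not Bert
    print(pick_token_classification_class('bert-base-uncased'))  # Bert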
@@ -729,10 +732,14 @@ class AutoModelForTokenClassification: """ if isinstance(config, CamembertConfig): return CamembertForTokenClassification(config) + elif isinstance(config, DistilBertConfig): + return DistilBertForTokenClassification(config) elif isinstance(config, BertConfig): return BertForTokenClassification(config) elif isinstance(config, XLNetConfig): return XLNetForTokenClassification(config) + elif isinstance(config, RobertaConfig): + return RobertaForTokenClassification(config) raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod @@ -746,10 +753,10 @@ class AutoModelForTokenClassification: The model class to instantiate is selected as the first pattern matching in the `pretrained_model_name_or_path` string (in the following order): - contains `distilbert`: DistilBertForTokenClassification (DistilBERT model) - - contains `albert`: AlbertForTokenClassification (ALBERT model) + - contains `camembert`: CamembertForTokenClassification (Camembert model) - contains `bert`: BertForTokenClassification (Bert model) - contains `xlnet`: XLNetForTokenClassification (XLNet model) - - contains `xlm`: XLMForTokenClassification (XLM model) + - contains `roberta`: RobertaForTokenClassification (Roberta model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` @@ -809,10 +816,14 @@ class AutoModelForTokenClassification: """ if 'camembert' in pretrained_model_name_or_path: return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: + return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: return BertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'xlnet' in pretrained_model_name_or_path: return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " - "'bert', 'xlnet', 'camembert'".format(pretrained_model_name_or_path)) + "'bert', 'xlnet', 'camembert', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)) diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index 1097f77a59..add7e03341 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -30,8 +30,8 @@ from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, \ TFXLNetForQuestionAnsweringSimple, TFXLNetForTokenClassification from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple -from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification -from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification +from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TFRobertaForTokenClassification +from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TFDistilBertForTokenClassification from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel from .file_utils import add_start_docstrings @@ -687,6 +687,8 @@ class TFAutoModelForTokenClassification: The model class to instantiate is selected based on the configuration class: - isInstance of `bert` configuration class: BertModel (Bert model) - isInstance of `xlnet` configuration class: XLNetModel (XLNet model) + - isInstance of `distilbert` configuration class: DistilBertModel (DistilBert model) + - isInstance of `roberta` configuration class: RobteraModel (Roberta model) Examples:: @@ -697,6 +699,10 @@ class TFAutoModelForTokenClassification: return TFBertForTokenClassification(config) elif isinstance(config, XLNetConfig): return TFXLNetForTokenClassification(config) + elif isinstance(config, DistilBertConfig): + return TFDistilBertForTokenClassification(config) + elif isinstance(config, RobertaConfig): + return TFRobertaForTokenClassification(config) raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod @@ -711,6 +717,8 @@ class TFAutoModelForTokenClassification: in the `pretrained_model_name_or_path` string (in the following order): - contains `bert`: BertForTokenClassification (Bert model) - contains `xlnet`: XLNetForTokenClassification (XLNet model) + - contains `distilbert`: DistilBertForTokenClassification (DistilBert model) + - contains `roberta`: RobertaForTokenClassification (Roberta model) The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated) To train the model, you should first set it back in training mode with `model.train()` @@ -772,6 +780,10 @@ class TFAutoModelForTokenClassification: return TFBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'xlnet' in pretrained_model_name_or_path: return TFXLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'distilbert' in pretrained_model_name_or_path: + return TFDistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'roberta' in pretrained_model_name_or_path: + return 
TFRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet'".format(pretrained_model_name_or_path)) + "'bert', 'xlnet', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)) From 2e2f9fed554bb5f147ea3d9573004b447dd7c9e7 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 11 Dec 2019 11:11:56 -0500 Subject: [PATCH 094/302] rm duplicate imports --- transformers/modeling_auto.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index b63e43d73b..6ba1aab7a3 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -28,7 +28,6 @@ from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassifica from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice -from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering from .modeling_utils import PreTrainedModel, SequenceSummary From 29570db25ba9dd30e5ac9be68dbcad95434964ec Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 11 Dec 2019 17:19:18 +0100 Subject: [PATCH 095/302] allowing from_pretrained to load from url directly --- transformers/modeling_tf_utils.py | 4 +++- transformers/modeling_utils.py | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index ed8fdb74c9..e7512b5bd6 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -259,8 +259,10 @@ class TFPreTrainedModel(tf.keras.Model): pretrained_model_name_or_path)) elif os.path.isfile(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + archive_file = pretrained_model_name_or_path + ".index" else: - raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path)) + archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 3ac568771e..9e7ca8d689 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -365,9 +365,12 @@ class PreTrainedModel(nn.Module): pretrained_model_name_or_path)) elif os.path.isfile(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path - else: - assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path) + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index") archive_file = pretrained_model_name_or_path + ".index" + else: + archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: From 6709739a05ca8b271a629ffebb497352449b7935 Mon Sep 17 00:00:00 2001 From: thomwolf Date: 
Wed, 11 Dec 2019 17:19:18 +0100 Subject: [PATCH 096/302] allowing from_pretrained to load from url directly --- transformers/modeling_tf_utils.py | 4 +++- transformers/modeling_utils.py | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 6c48f3eed2..95c29693d8 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -265,8 +265,10 @@ class TFPreTrainedModel(tf.keras.Model): pretrained_model_name_or_path)) elif os.path.isfile(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + archive_file = pretrained_model_name_or_path + ".index" else: - raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path)) + archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 398172a88c..eec9034fd7 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -364,9 +364,12 @@ class PreTrainedModel(nn.Module): pretrained_model_name_or_path)) elif os.path.isfile(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path - else: - assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path) + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( + pretrained_model_name_or_path + ".index") archive_file = pretrained_model_name_or_path + ".index" + else: + archive_file = pretrained_model_name_or_path # redirect to the cache, if necessary try: From 030faccb8d45be9bdd2b4b80ff26f36dc41f622a Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 11 Dec 2019 17:44:21 +0100 Subject: [PATCH 097/302] doc: fix pretrained models table --- docs/source/pretrained_models.rst | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index dd61f11769..2fe1f8a314 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -169,35 +169,35 @@ Here is the full list of the currently provided pretrained models together with +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | ALBERT | ``albert-base-v1`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters | | | | | ALBERT base model | -| | | (see `details `__) | +| | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``albert-large-v1`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters | | | | | ALBERT large model | -| | | (see `details `__) | +| | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``albert-xlarge-v1`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters | | | | | ALBERT 
xlarge model | -| | | (see `details `__) | +| | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``albert-xxlarge-v1`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters | | | | | ALBERT xxlarge model | -| | | (see `details `__) | +| | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``albert-base-v2`` | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters | | | | | ALBERT base model with no dropout, additional training data and longer training | -| | | (see `details `__) | +| | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``albert-large-v2`` | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters | | | | | ALBERT large model with no dropout, additional training data and longer training | -| | | (see `details `__) | +| | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``albert-xlarge-v2`` | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters | | | | | ALBERT xlarge model with no dropout, additional training data and longer training | -| | | (see `details `__) | +| | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``albert-xxlarge-v2`` | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters | | | | | ALBERT xxlarge model with no dropout, additional training data and longer training | -| | | (see `details `__) | +| | | (see `details `__) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ From c999a3e5050f1dc93d814abf352f3bf0c06572e7 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 11 Dec 2019 12:29:58 -0500 Subject: [PATCH 098/302] Allow from_pretrained to take a remote identifier --- transformers/configuration_utils.py | 8 +++++--- transformers/file_utils.py | 20 ++++++++++++++++---- transformers/modeling_utils.py | 8 +++++--- transformers/tokenization_utils.py | 10 +++++----- 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 08cee75d81..8ae30f2a48 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -24,7 +24,7 @@ import logging import os from io import open -from .file_utils import cached_path, CONFIG_NAME +from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url logger = logging.getLogger(__name__) @@ -131,8 +131,10 @@ class PretrainedConfig(object): config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path] elif 
os.path.isdir(pretrained_model_name_or_path): config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) - else: + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): config_file = pretrained_model_name_or_path + else: + config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME) # redirect to the cache, if necessary try: resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, @@ -187,7 +189,7 @@ class PretrainedConfig(object): @classmethod def from_json_file(cls, json_file): - """Constructs a `BertConfig` from a json file of parameters.""" + """Constructs a `Config` from a json file of parameters.""" with open(json_file, "r", encoding='utf-8') as reader: text = reader.read() return cls.from_dict(json.loads(text)) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 68de4e6e2f..5fd5e2ee39 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -73,6 +73,8 @@ TF2_WEIGHTS_NAME = 'tf_model.h5' TF_WEIGHTS_NAME = 'model.ckpt' CONFIG_NAME = "config.json" +S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" + def is_torch_available(): return _torch_available @@ -103,6 +105,18 @@ else: return fn return docstring_decorator + +def is_remote_url(url_or_filename): + parsed = urlparse(url_or_filename) + return parsed.scheme in ('http', 'https', 's3') + +def hf_bucket_url(identifier, postfix=None): + if postfix is None: + return "/".join((S3_BUCKET_PREFIX, identifier)) + else: + return "/".join((S3_BUCKET_PREFIX, identifier, postfix)) + + def url_to_filename(url, etag=None): """ Convert `url` into a hashed filename in a repeatable way. @@ -171,9 +185,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N if sys.version_info[0] == 3 and isinstance(cache_dir, Path): cache_dir = str(cache_dir) - parsed = urlparse(url_or_filename) - - if parsed.scheme in ('http', 'https', 's3'): + if is_remote_url(url_or_filename): # URL, so get it from the cache (downloading if necessary) return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies, @@ -181,7 +193,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N elif os.path.exists(url_or_filename): # File, and it exists. return url_or_filename - elif parsed.scheme == '': + elif urlparse(url_or_filename).scheme == '': # File, but it doesn't exist. 
raise EnvironmentError("file {} not found".format(url_or_filename)) else: diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 9e7ca8d689..eac4252336 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss from torch.nn import functional as F from .configuration_utils import PretrainedConfig -from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME +from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, + cached_path, hf_bucket_url, is_remote_url) logger = logging.getLogger(__name__) @@ -363,14 +364,15 @@ class PreTrainedModel(nn.Module): raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format( [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"], pretrained_model_name_or_path)) - elif os.path.isfile(pretrained_model_name_or_path): + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format( pretrained_model_name_or_path + ".index") archive_file = pretrained_model_name_or_path + ".index" else: - archive_file = pretrained_model_name_or_path + archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME) + # todo do we want to support TF checkpoints here? # redirect to the cache, if necessary try: diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 68a767fe82..2b2cec0c15 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -25,7 +25,7 @@ import itertools import re from io import open -from .file_utils import cached_path, is_tf_available, is_torch_available +from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available if is_tf_available(): import tensorflow as tf @@ -327,12 +327,12 @@ class PreTrainedTokenizer(object): if os.path.isdir(pretrained_model_name_or_path): # If a directory is provided we look for the standard filenames full_file_name = os.path.join(pretrained_model_name_or_path, file_name) - else: + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file) full_file_name = pretrained_model_name_or_path - if not os.path.exists(full_file_name): - logger.info("Didn't find file {}. 
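With the helpers added above, a model identifier that is neither a local path nor a URL is resolved to a bucket URL by plain string joining. A quick illustration; the identifier is only an example of the user/model format used elsewhere in this series::

    from transformers.file_utils import hf_bucket_url, CONFIG_NAME

    print(hf_bucket_url('dbmz/bert-base-german-cased', postfix=CONFIG_NAME))
    # https://s3.amazonaws.com/models.huggingface.co/bert/dbmz/bert-base-german-cased/config.json

    print(hf_bucket_url('dbmz/bert-base-german-cased'))
    # https://s3.amazonaws.com/models.huggingface.co/bert/dbmz/bert-base-german-cased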
We won't load it.".format(full_file_name)) - full_file_name = None + else: + full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name) + vocab_files[file_id] = full_file_name # Look for the additional tokens files From 3d57c51111054adb01b2ea94bfd45237eb282431 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 11 Dec 2019 15:10:17 -0500 Subject: [PATCH 099/302] Fix encode plus --- transformers/tokenization_utils.py | 39 ++++++++++++++++++------------ 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 68a767fe82..eace409555 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -916,7 +916,7 @@ class PreTrainedTokenizer(object): return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True). - return_attention_mask: (optional) Set to False to avoir returning attention mask (default True) + return_attention_mask: (optional) Set to False to avoid returning attention mask (default True) return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False). return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False). @@ -961,24 +961,13 @@ class PreTrainedTokenizer(object): if add_special_tokens: sequence = self.build_inputs_with_special_tokens(ids, pair_ids) token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) - special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids) else: sequence = ids + pair_ids if pair else ids token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) - special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0)) + if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) - # Prepare inputs as tensors if asked - if return_tensors == 'tf' and is_tf_available(): - sequence = tf.constant([sequence]) - token_type_ids = tf.constant([token_type_ids]) - elif return_tensors == 'pt' and is_torch_available(): - sequence = torch.tensor([sequence]) - token_type_ids = torch.tensor([token_type_ids]) - elif return_tensors is not None: - logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) - encoded_inputs["input_ids"] = sequence if return_token_type_ids: encoded_inputs["token_type_ids"] = token_type_ids @@ -1015,10 +1004,9 @@ class PreTrainedTokenizer(object): if return_special_tokens_mask: encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference - elif self.padding_side == 'left': if return_attention_mask: - encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) + encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"]) if return_token_type_ids: encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"] if return_special_tokens_mask: @@ -1030,7 +1018,26 @@ class PreTrainedTokenizer(object): elif return_attention_mask: encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"]) - + + # Prepare inputs as tensors if 
asked + if return_tensors == 'tf' and is_tf_available(): + encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]]) + encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]]) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]]) + + elif return_tensors == 'pt' and is_torch_available(): + encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]]) + encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]]) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]]) + elif return_tensors is not None: + logger.warning( + "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format( + return_tensors)) + return encoded_inputs def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0): From 31e5b5ff2276c61af7eebb4c353934f8f675d728 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 11 Dec 2019 15:22:02 -0500 Subject: [PATCH 100/302] Fix tests + first example of doc --- transformers/tokenization_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 2b2cec0c15..63d2cc5cb4 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -255,6 +255,7 @@ class PreTrainedTokenizer(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. @@ -282,6 +283,9 @@ class PreTrainedTokenizer(object): # Download vocabulary from S3 and cache. tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + # Download vocabulary from S3 (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained('dbmz/bert-base-german-cased') + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') @@ -327,6 +331,9 @@ class PreTrainedTokenizer(object): if os.path.isdir(pretrained_model_name_or_path): # If a directory is provided we look for the standard filenames full_file_name = os.path.join(pretrained_model_name_or_path, file_name) + if not os.path.exists(full_file_name): + logger.info("Didn't find file {}. 
We won't load it.".format(full_file_name)) + full_file_name = None elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file) full_file_name = pretrained_model_name_or_path From 18e1f751f1d996c4fe01559ade1cd013186b81e4 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 11 Dec 2019 17:07:46 -0500 Subject: [PATCH 101/302] TF support --- transformers/modeling_tf_utils.py | 9 ++++++--- transformers/modeling_utils.py | 3 ++- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index e7512b5bd6..4a6d18f447 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -24,7 +24,8 @@ import os import tensorflow as tf from .configuration_utils import PretrainedConfig -from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME +from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME, + cached_path, hf_bucket_url, is_remote_url) from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model logger = logging.getLogger(__name__) @@ -257,12 +258,14 @@ class TFPreTrainedModel(tf.keras.Model): raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format( [WEIGHTS_NAME, TF2_WEIGHTS_NAME], pretrained_model_name_or_path)) - elif os.path.isfile(pretrained_model_name_or_path): + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): archive_file = pretrained_model_name_or_path elif os.path.isfile(pretrained_model_name_or_path + ".index"): archive_file = pretrained_model_name_or_path + ".index" else: - archive_file = pretrained_model_name_or_path + archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME) + if from_pt: + raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.") # redirect to the cache, if necessary try: diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index eac4252336..37088f8e67 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -372,7 +372,8 @@ class PreTrainedModel(nn.Module): archive_file = pretrained_model_name_or_path + ".index" else: archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME) - # todo do we want to support TF checkpoints here? + if from_tf: + raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.") # redirect to the cache, if necessary try: From 4f15e5a267201f86bdd9628cf58592d0e1cc86eb Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 11 Dec 2019 17:41:51 -0500 Subject: [PATCH 102/302] Add tests. Maybe not the best possible place for the tests, lmk. 
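For context, the behavior these tests exercise is loading a model or tokenizer from a user-uploaded identifier rather than a predefined shortcut name or a local path. A minimal sketch, assuming the `julien-c/bert-xsmall-dummy` dummy model that the test utilities below define as SMALL_MODEL_IDENTIFIER (any user-uploaded "namespace/model-name" identifier on the bucket should behave the same way):

    from transformers import AutoTokenizer, AutoModelWithLMHead

    # A "namespace/model-name" string is neither a local path nor a known
    # shortcut, so it falls through the filesystem checks, is resolved to an
    # S3 URL by hf_bucket_url(), and is fetched via the usual cached_path().
    tokenizer = AutoTokenizer.from_pretrained("julien-c/bert-xsmall-dummy")
    model = AutoModelWithLMHead.from_pretrained("julien-c/bert-xsmall-dummy")

The same resolution applies to configs and TF weights, with the caveat from the previous patches that cross-framework loading (from_tf / from_pt) is not supported for identifier names.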
--- transformers/tests/modeling_auto_test.py | 7 ++++++- transformers/tests/modeling_tf_auto_test.py | 7 ++++++- transformers/tests/tokenization_auto_test.py | 7 ++++++- transformers/tests/utils.py | 3 +++ 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py index 9b7d920bc8..871a262fe8 100644 --- a/transformers/tests/modeling_auto_test.py +++ b/transformers/tests/modeling_auto_test.py @@ -22,7 +22,7 @@ import logging from transformers import is_torch_available -from .utils import require_torch, slow +from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER if is_torch_available(): from transformers import (AutoConfig, BertConfig, @@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase): self.assertIsNotNone(model) self.assertIsInstance(model, BertForQuestionAnswering) + def test_from_pretrained_identifier(self): + logging.basicConfig(level=logging.INFO) + model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER) + self.assertIsInstance(model, BertForMaskedLM) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py index 7ea48015d9..7ab6eaa3d6 100644 --- a/transformers/tests/modeling_tf_auto_test.py +++ b/transformers/tests/modeling_tf_auto_test.py @@ -22,7 +22,7 @@ import logging from transformers import is_tf_available -from .utils import require_tf, slow +from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER if is_tf_available(): from transformers import (AutoConfig, BertConfig, @@ -93,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase): self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForQuestionAnswering) + def test_from_pretrained_identifier(self): + logging.basicConfig(level=logging.INFO) + model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True) + self.assertIsInstance(model, TFBertForMaskedLM) + if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py index 18346d2768..0a894cac04 100644 --- a/transformers/tests/tokenization_auto_test.py +++ b/transformers/tests/tokenization_auto_test.py @@ -23,7 +23,7 @@ import logging from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP -from .utils import slow +from .utils import slow, SMALL_MODEL_IDENTIFIER class AutoTokenizerTest(unittest.TestCase): @@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase): self.assertIsInstance(tokenizer, GPT2Tokenizer) self.assertGreater(len(tokenizer), 0) + def test_tokenizer_from_pretrained_identifier(self): + logging.basicConfig(level=logging.INFO) + tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER) + self.assertIsInstance(tokenizer, BertTokenizer) + self.assertEqual(len(tokenizer), 12) if __name__ == "__main__": unittest.main() diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py index 7a51ab612b..3aff1daf83 100644 --- a/transformers/tests/utils.py +++ b/transformers/tests/utils.py @@ -6,6 +6,9 @@ from distutils.util import strtobool from transformers.file_utils import _tf_available, _torch_available +SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" + + try: run_slow = os.environ["RUN_SLOW"] except KeyError: From c03c0dfd230a5174c536a58d6ba5e590ed1afcc4 Mon Sep 17 00:00:00 2001 From: Masatoshi 
Suzuki Date: Fri, 15 Nov 2019 17:24:56 +0900 Subject: [PATCH 103/302] Add support for Japanese BERT models by cl-tohoku --- docs/source/pretrained_models.rst | 18 ++ transformers/__init__.py | 1 + transformers/configuration_bert.py | 4 + transformers/modeling_bert.py | 8 +- transformers/modeling_tf_bert.py | 16 +- transformers/tokenization_auto.py | 3 + transformers/tokenization_bert_japanese.py | 247 +++++++++++++++++++++ 7 files changed, 289 insertions(+), 8 deletions(-) create mode 100644 transformers/tokenization_bert_japanese.py diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 2fe1f8a314..d3498e057d 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with | | ``bert-base-german-dbmdz-uncased`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | Trained on uncased German text by DBMDZ | | | | (see `details on dbmdz repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. | +| | | | `MeCab `__ is required for tokenization. | +| | | (see `details on cl-tohoku repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. | +| | | | `MeCab `__ is required for tokenization. | +| | | (see `details on cl-tohoku repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text. Text is tokenized into characters. | +| | | (see `details on cl-tohoku repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. | +| | | (see `details on cl-tohoku repository `__). | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. 
| | | | | OpenAI GPT English model | diff --git a/transformers/__init__.py b/transformers/__init__.py index f9a28add5f..5d7b0b772c 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -37,6 +37,7 @@ if is_sklearn_available(): from .tokenization_utils import (PreTrainedTokenizer) from .tokenization_auto import AutoTokenizer from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer +from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) from .tokenization_gpt2 import GPT2Tokenizer diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index d63be963eb..16f1f60404 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json" } diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index d84b0a1a7c..e2e115a015 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin" } @@ -1233,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel): question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]" input_ids = tokenizer.encode(input_text) - token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] + token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in 
range(len(input_ids))] start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])) - all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) # a nice puppet diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 5aa7bb3da2..27dd311a4d 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5" } @@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): linear tensor, float32 with shape [batch_size, length, vocab_size]. Raises: ValueError: if mode is not valid. - + Shared weights logic adapted from https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 """ @@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer): input_shape = shape_list(input_ids) else: input_shape = shape_list(inputs_embeds)[:-1] - + seq_length = input_shape[1] if position_ids is None: position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :] @@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer): context_layer = tf.matmul(attention_probs, value_layer) context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) - context_layer = tf.reshape(context_layer, + context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size)) # (batch_size, seq_len_q, all_head_size) outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,) @@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r""" The BERT model was proposed in `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})` Parameters: - config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. + config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. """ @@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r""" (a) For sequence pairs: ``tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . 
[SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: ``tokens: [CLS] the dog is hairy . [SEP]`` - + ``token_type_ids: 0 0 0 0 0 0 0`` Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index b7c5046961..d63b7e783d 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging from .tokenization_bert import BertTokenizer +from .tokenization_bert_japanese import BertJapaneseTokenizer from .tokenization_openai import OpenAIGPTTokenizer from .tokenization_gpt2 import GPT2Tokenizer from .tokenization_ctrl import CTRLTokenizer @@ -118,6 +119,8 @@ class AutoTokenizer(object): return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'bert-japanese' in pretrained_model_name_or_path: + return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'bert' in pretrained_model_name_or_path: return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'openai-gpt' in pretrained_model_name_or_path: diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py new file mode 100644 index 0000000000..8554a1c880 --- /dev/null +++ b/transformers/tokenization_bert_japanese.py @@ -0,0 +1,247 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Tokenization classes.""" + +from __future__ import absolute_import, division, print_function, unicode_literals + +import collections +import logging +import os +import unicodedata +from io import open + +from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab +from .tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt", + 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt", + 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt", + 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt" + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'bert-base-japanese': 512, + 'bert-base-japanese-whole-word-masking': 512, + 'bert-base-japanese-char': 512, + 'bert-base-japanese-char-whole-word-masking': 512 +} + +PRETRAINED_INIT_CONFIGURATION = { + 'bert-base-japanese': { + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'wordpiece' + }, + 'bert-base-japanese-whole-word-masking':{ + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'wordpiece' + }, + 'bert-base-japanese-char': { + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'character' + }, + 'bert-base-japanese-char-whole-word-masking': { + 'do_lower_case': False, + 'word_tokenizer_type': 'mecab', + 'subword_tokenizer_type': 'character' + } +} + + +class BertJapaneseTokenizer(BertTokenizer): + """BERT tokenizer for Japanese text""" + + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, do_lower_case=False, + do_word_tokenize=True, do_subword_tokenize=True, + word_tokenizer_type='basic', subword_tokenizer_type='wordpiece', + never_split=None, unk_token='[UNK]', sep_token='[SEP]', + pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs): + """Constructs a MecabBertTokenizer. + + Args: + **vocab_file**: Path to a one-wordpiece-per-line vocabulary file. + **do_lower_case**: (`optional`) boolean (default True) + Whether to lower case the input. + Only has an effect when do_basic_tokenize=True. + **do_word_tokenize**: (`optional`) boolean (default True) + Whether to do word tokenization. + **do_subword_tokenize**: (`optional`) boolean (default True) + Whether to do subword tokenization. + **word_tokenizer_type**: (`optional`) string (default "basic") + Type of word tokenizer. + **subword_tokenizer_type**: (`optional`) string (default "wordpiece") + Type of subword tokenizer. 
+ """ + super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, + pad_token=pad_token, cls_token=cls_token, + mask_token=mask_token, **kwargs) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 3 # take into account special tokens + + if not os.path.isfile(vocab_file): + raise ValueError( + "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained " + "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file)) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict( + [(ids, tok) for tok, ids in self.vocab.items()]) + + self.do_word_tokenize = do_word_tokenize + if do_word_tokenize: + if word_tokenizer_type == 'basic': + self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=False) + elif word_tokenizer_type == 'mecab': + self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case, + never_split=never_split) + else: + raise ValueError( + "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type)) + + self.do_subword_tokenize = do_subword_tokenize + if do_subword_tokenize: + if subword_tokenizer_type == 'wordpiece': + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, + unk_token=self.unk_token) + elif subword_tokenizer_type == 'character': + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, + unk_token=self.unk_token) + else: + raise ValueError( + "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type)) + + + def _tokenize(self, text): + if self.do_word_tokenize: + tokens = self.word_tokenizer.tokenize(text, + never_split=self.all_special_tokens) + else: + tokens = [text] + + if self.do_subword_tokenize: + split_tokens = [sub_token for token in tokens + for sub_token in self.subword_tokenizer.tokenize(token)] + else: + split_tokens = tokens + + return split_tokens + + +class MecabTokenizer(object): + """Runs basic tokenization with MeCab morphological parser.""" + + def __init__(self, do_lower_case=False, never_split=None, normalize_text=True): + """Constructs a MecabTokenizer. + + Args: + **do_lower_case**: (`optional`) boolean (default True) + Whether to lower case the input. + **never_split**: (`optional`) list of str + Kept for backward compatibility purposes. + Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`) + List of token not to split. + **normalize_text**: (`optional`) boolean (default True) + Whether to apply unicode normalization to text before tokenization. 
+ """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + + import MeCab + self.mecab = MeCab.Tagger() + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize('NFKC', text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + cursor = 0 + for line in self.mecab.parse(text).split('\n'): + if line == 'EOS': + break + + token, _ = line.split('\t') + token_start = text.index(token, cursor) + token_end = token_start + len(token) + if self.do_lower_case and token not in never_split: + token = token.lower() + + tokens.append(token) + cursor = token_end + + return tokens + + +class CharacterTokenizer(object): + """Runs Character tokenziation.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + """Constructs a CharacterTokenizer. + + Args: + **vocab**: + Vocabulary object. + **unk_token**: str + A special symbol for out-of-vocabulary token. + **normalize_text**: (`optional`) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + """ + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """Tokenizes a piece of text into characters. + + For example: + input = "apple" + output = ["a", "p", "p", "l", "e"] + Args: + text: A single token or whitespace separated tokens. + This should have already been passed through `BasicTokenizer`. + Returns: + A list of characters. + """ + if self.normalize_text: + text = unicodedata.normalize('NFKC', text) + + output_tokens = [] + for i, char in enumerate(text): + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens From 57b5cb3eaa850a212235fccbd4e5d002aede72b6 Mon Sep 17 00:00:00 2001 From: Masatoshi Suzuki Date: Wed, 20 Nov 2019 09:02:10 +0900 Subject: [PATCH 104/302] Fix loading BertJapaneseTokenizer --- transformers/tokenization_auto.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index d63b7e783d..f36a584521 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -73,6 +73,7 @@ class AutoTokenizer(object): - contains `albert`: AlbertTokenizer (ALBERT model) - contains `camembert`: CamembertTokenizer (CamemBERT model) - contains `roberta`: RobertaTokenizer (RoBERTa model) + - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model) - contains `bert`: BertTokenizer (Bert model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model) @@ -119,7 +120,7 @@ class AutoTokenizer(object): return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - elif 'bert-japanese' in pretrained_model_name_or_path: + elif 'bert-base-japanese' in pretrained_model_name_or_path: return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'bert' in pretrained_model_name_or_path: return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) From a09da4eeb0397dd66d61182177dd3b753d70e62a Mon Sep 17 00:00:00 2001 From: Masatoshi 
Suzuki Date: Fri, 29 Nov 2019 19:24:43 +0900 Subject: [PATCH 105/302] Add a test for Japanese BERT tokenizers --- .../tests/tokenization_bert_japanese_test.py | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 transformers/tests/tokenization_bert_japanese_test.py diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py new file mode 100644 index 0000000000..6f66b96411 --- /dev/null +++ b/transformers/tests/tokenization_bert_japanese_test.py @@ -0,0 +1,192 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import unittest +import pytest +from io import open + +from transformers.tokenization_bert import WordpieceTokenizer +from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer, + MecabTokenizer, CharacterTokenizer, + VOCAB_FILES_NAMES) + +from .tokenization_tests_commons import CommonTestCases + + +class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = BertJapaneseTokenizer + + def setUp(self): + super(BertJapaneseTokenizationTest, self).setUp() + + vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", + u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは", + u"世界", u"##世界", u"、", u"##、", u"。", u"##。"] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs) + + def get_input_output_texts(self): + input_text = u"こんにちは、世界。 \nこんばんは、世界。" + output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file) + + tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。") + self.assertListEqual(tokens, + [u"こんにちは", u"、", u"世界", u"。", + u"こん", u"##ばんは", u"、", u"世界", "。"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), + [3, 12, 10, 14, 4, 9, 12, 10, 14]) + + def test_mecab_tokenizer(self): + tokenizer = MecabTokenizer() + + self.assertListEqual( + tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), + [u"アップルストア", u"で", u"iPhone", u"8", u"が", + u"発売", u"さ", u"れ", u"た", u"。"]) + + def test_mecab_tokenizer_lower(self): + tokenizer = MecabTokenizer(do_lower_case=True) + + self.assertListEqual( + tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), + [u"アップルストア", u"で", u"iphone", u"8", u"が", + u"発売", u"さ", u"れ", u"た", u"。"]) + + def test_mecab_tokenizer_no_normalize(self): + tokenizer = MecabTokenizer(normalize_text=False) + + self.assertListEqual( + tokenizer.tokenize(u" \tアップルストアでiPhone8 が \n 発売された 。 "), + [u"アップルストア", u"で", u"iPhone", u"8", u"が", + u"発売", u"さ", u"れ", u"た", u" ", u"。"]) + + def 
test_wordpiece_tokenizer(self): + vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", + u"こんにちは", u"こん", u"にちは" u"ばんは", u"##こん", u"##にちは", u"##ばんは"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]") + + self.assertListEqual(tokenizer.tokenize(u""), []) + + self.assertListEqual(tokenizer.tokenize(u"こんにちは"), + [u"こんにちは"]) + + self.assertListEqual(tokenizer.tokenize(u"こんばんは"), + [u"こん", u"##ばんは"]) + + self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"), + [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"]) + + @pytest.mark.slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese") + + text = tokenizer.encode(u"ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + # 2 is for "[CLS]", 3 is for "[SEP]" + assert encoded_sentence == [2] + text + [3] + assert encoded_pair == [2] + text + [3] + text_2 + [3] + + +class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester): + + tokenizer_class = BertJapaneseTokenizer + + def setUp(self): + super(BertJapaneseCharacterTokenizationTest, self).setUp() + + vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", + u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"] + + self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"]) + with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer: + vocab_writer.write("".join([x + "\n" for x in vocab_tokens])) + + def get_tokenizer(self, **kwargs): + return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, + subword_tokenizer_type="character", + **kwargs) + + def get_input_output_texts(self): + input_text = u"こんにちは、世界。 \nこんばんは、世界。" + output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。" + return input_text, output_text + + def test_full_tokenizer(self): + tokenizer = self.tokenizer_class(self.vocab_file, + subword_tokenizer_type="character") + + tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。") + self.assertListEqual(tokens, + [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。", + u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"]) + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), + [3, 4, 5, 6, 7, 11, 9, 10, 12, + 3, 4, 8, 4, 7, 11, 9, 10, 12]) + + def test_character_tokenizer(self): + vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]", + u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界"u"、", u"。"] + + vocab = {} + for (i, token) in enumerate(vocab_tokens): + vocab[token] = i + tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]") + + self.assertListEqual(tokenizer.tokenize(u""), []) + + self.assertListEqual(tokenizer.tokenize(u"こんにちは"), + [u"こ", u"ん", u"に", u"ち", u"は"]) + + self.assertListEqual(tokenizer.tokenize(u"こんにちほ"), + [u"こ", u"ん", u"に", u"ち", u"[UNK]"]) + + @pytest.mark.slow + def test_sequence_builders(self): + tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char") + + text = tokenizer.encode(u"ありがとう。", add_special_tokens=False) + text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False) + + encoded_sentence = tokenizer.build_inputs_with_special_tokens(text) + encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2) + + # 2 is for "[CLS]", 3 is for "[SEP]" + assert encoded_sentence == [2] + text + [3] + assert encoded_pair == [2] + text + [3] + text_2 
+ [3] + + + +if __name__ == '__main__': + unittest.main() From 6a43dc9d7d592362d144209097e1d93876f8e88a Mon Sep 17 00:00:00 2001 From: Masatoshi Suzuki Date: Thu, 5 Dec 2019 11:19:02 +0900 Subject: [PATCH 106/302] Support Python 2 --- transformers/tokenization_bert_japanese.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py index 8554a1c880..1ce0e1d1cb 100644 --- a/transformers/tokenization_bert_japanese.py +++ b/transformers/tokenization_bert_japanese.py @@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import collections import logging import os +import six import unicodedata from io import open @@ -186,8 +187,13 @@ class MecabTokenizer(object): never_split = self.never_split + (never_split if never_split is not None else []) tokens = [] + if six.PY2: + mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8') + else: + mecab_output = self.mecab.parse(text) + cursor = 0 - for line in self.mecab.parse(text).split('\n'): + for line in mecab_output.split('\n'): if line == 'EOS': break From 597ba7feb384316081c96955196fcb7abb2edf06 Mon Sep 17 00:00:00 2001 From: Masatoshi Suzuki Date: Thu, 5 Dec 2019 11:30:40 +0900 Subject: [PATCH 107/302] Support testing Japanese BERT tokenizers --- .circleci/config.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 01e6d82b33..97f5f25606 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -13,6 +13,8 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install tensorboardX scikit-learn + - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig + - run: sudo pip install mecab-python3 - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov build_py3_torch: @@ -27,6 +29,8 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install tensorboardX scikit-learn + - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig + - run: sudo pip install mecab-python3 - run: python -m pytest -sv ./transformers/tests/ --cov - run: python -m pytest -sv ./examples/ - run: codecov @@ -42,6 +46,8 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install tensorboardX scikit-learn + - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig + - run: sudo pip install mecab-python3 - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov build_py2_torch: @@ -55,6 +61,8 @@ jobs: - run: sudo pip install torch - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov + - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig + - run: sudo pip install mecab-python - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov build_py2_tf: @@ -68,6 +76,8 @@ jobs: - run: sudo pip install tensorflow - run: sudo pip install --progress-bar off . 
- run: sudo pip install pytest codecov pytest-cov + - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig + - run: sudo pip install mecab-python - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov deploy_doc: From d2100428d3652cefbffcf0bd00f0881090d26333 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 10 Dec 2019 21:43:49 +0000 Subject: [PATCH 108/302] Update to new test infra and only run conditionally --- .circleci/config.yml | 20 ++++----- .../tests/tokenization_bert_japanese_test.py | 9 ++-- transformers/tests/utils.py | 42 +++++++++++++------ 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 97f5f25606..7ca5f8121c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -13,8 +13,6 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install tensorboardX scikit-learn - - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig - - run: sudo pip install mecab-python3 - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov build_py3_torch: @@ -29,8 +27,6 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install tensorboardX scikit-learn - - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig - - run: sudo pip install mecab-python3 - run: python -m pytest -sv ./transformers/tests/ --cov - run: python -m pytest -sv ./examples/ - run: codecov @@ -46,8 +42,6 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - run: sudo pip install tensorboardX scikit-learn - - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig - - run: sudo pip install mecab-python3 - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov build_py2_torch: @@ -61,8 +55,6 @@ jobs: - run: sudo pip install torch - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig - - run: sudo pip install mecab-python - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov build_py2_tf: @@ -76,10 +68,18 @@ jobs: - run: sudo pip install tensorflow - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov - - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig - - run: sudo pip install mecab-python - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov + build_py3_custom_tokenizers: + working_directory: ~/transformers + docker: + - image: circleci/python:3.5 + steps: + - checkout + - run: sudo pip install --progress-bar off . 
+ - run: sudo pip install pytest + - run: sudo pip install mecab-python3 + - run: python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py deploy_doc: working_directory: ~/transformers docker: diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py index 6f66b96411..545193c7cc 100644 --- a/transformers/tests/tokenization_bert_japanese_test.py +++ b/transformers/tests/tokenization_bert_japanese_test.py @@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera import os import unittest -import pytest from io import open from transformers.tokenization_bert import WordpieceTokenizer @@ -25,8 +24,10 @@ from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer, VOCAB_FILES_NAMES) from .tokenization_tests_commons import CommonTestCases +from .utils import slow, custom_tokenizers +@custom_tokenizers class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): tokenizer_class = BertJapaneseTokenizer @@ -104,7 +105,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester): self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"), [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"]) - @pytest.mark.slow + @slow def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese") @@ -172,7 +173,7 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste self.assertListEqual(tokenizer.tokenize(u"こんにちほ"), [u"こ", u"ん", u"に", u"ち", u"[UNK]"]) - @pytest.mark.slow + @slow def test_sequence_builders(self): tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char") @@ -188,5 +189,3 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste -if __name__ == '__main__': - unittest.main() diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py index 7a51ab612b..2b97293ca7 100644 --- a/transformers/tests/utils.py +++ b/transformers/tests/utils.py @@ -6,18 +6,23 @@ from distutils.util import strtobool from transformers.file_utils import _tf_available, _torch_available -try: - run_slow = os.environ["RUN_SLOW"] -except KeyError: - # RUN_SLOW isn't set, default to skipping slow tests. - _run_slow_tests = False -else: - # RUN_SLOW is set, convert it to True or False. +def parse_flag_from_env(key, default=False): try: - _run_slow_tests = strtobool(run_slow) - except ValueError: - # More values are supported, but let's keep the message simple. - raise ValueError("If set, RUN_SLOW must be yes or no.") + value = os.environ[key] + except KeyError: + # KEY isn't set, default to `default`. + _value = default + else: + # KEY is set, convert it to True or False. + try: + _value = strtobool(value) + except ValueError: + # More values are supported, but let's keep the message simple. + raise ValueError("If set, {} must be yes or no.".format(key)) + return _value + +_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) +_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False) def slow(test_case): @@ -33,6 +38,19 @@ def slow(test_case): return test_case +def custom_tokenizers(test_case): + """ + Decorator marking a test for a custom tokenizer. + + Custom tokenizers require additional dependencies, and are skipped + by default. Set the RUN_CUSTOM_TOKENIZERS environment variable + to a truthy value to run them. 
+ """ + if not _run_custom_tokenizers: + test_case = unittest.skip("test of custom tokenizers")(test_case) + return test_case + + def require_torch(test_case): """ Decorator marking a test that requires PyTorch. @@ -59,6 +77,6 @@ def require_tf(test_case): if _torch_available: # Set the USE_CUDA environment variable to select a GPU. - torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu" + torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu" else: torch_device = None From 95854c4a2f8d418a14e64b4edf64fc7363b1ff10 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 10 Dec 2019 21:46:00 +0000 Subject: [PATCH 109/302] Actually run the tests --- .circleci/config.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7ca5f8121c..d8f624a0e5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -79,7 +79,7 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest - run: sudo pip install mecab-python3 - - run: python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py + - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py deploy_doc: working_directory: ~/transformers docker: From 9cb97c0c0f7215971bb5a39cd070e5bd89319bdf Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 10 Dec 2019 21:48:56 +0000 Subject: [PATCH 110/302] Actually run the tests --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index d8f624a0e5..9d6e02d580 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -101,6 +101,7 @@ workflows: version: 2 build_and_test: jobs: + - build_py3_custom_tokenizers - build_py3_torch_and_tf - build_py3_torch - build_py3_tf From 5505cf701477762cedf792e20344d29bc8bf6325 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 10 Dec 2019 21:53:44 +0000 Subject: [PATCH 111/302] Run tests on Py2 too, for Lysandre --- .circleci/config.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9d6e02d580..afc6d5ce44 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,6 +80,16 @@ jobs: - run: sudo pip install pytest - run: sudo pip install mecab-python3 - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py + build_py2_custom_tokenizers: + working_directory: ~/transformers + docker: + - image: circleci/python:2.7 + steps: + - checkout + - run: sudo pip install --progress-bar off . + - run: sudo pip install pytest + - run: sudo pip install mecab-python + - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py deploy_doc: working_directory: ~/transformers docker: @@ -102,6 +112,7 @@ workflows: build_and_test: jobs: - build_py3_custom_tokenizers + - build_py2_custom_tokenizers - build_py3_torch_and_tf - build_py3_torch - build_py3_tf From 371c5ddfad96689771465aff557152322190b60e Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 10 Dec 2019 21:55:43 +0000 Subject: [PATCH 112/302] Py2 tests for Lysandre --- .circleci/config.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index afc6d5ce44..c827a81fbb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -88,6 +88,7 @@ jobs: - checkout - run: sudo pip install --progress-bar off . 
- run: sudo pip install pytest + - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig - run: sudo pip install mecab-python - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py deploy_doc: From 36fc52a3b4b50885d5ec3bf259f81740e19d8b3c Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 10 Dec 2019 22:03:35 +0000 Subject: [PATCH 113/302] Update links to weights --- transformers/configuration_bert.py | 8 ++++---- transformers/modeling_bert.py | 8 ++++---- transformers/modeling_tf_bert.py | 8 ++++---- transformers/tokenization_bert_japanese.py | 8 ++++---- 4 files changed, 16 insertions(+), 16 deletions(-) diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 16f1f60404..01fcd88cb8 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -42,10 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json", - 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json", - 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json", - 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json", - 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json" + 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", + 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", + 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", + 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json" } diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index e2e115a015..d0f35272ac 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -48,10 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin", - 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin", - 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin", - 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin", - 
'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin" + 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", + 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", + 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", + 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin" } diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 27dd311a4d..7cc71f5063 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -48,10 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5", 'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5", 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5", - 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5", - 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5", - 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5", - 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5" + 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", + 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", + 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", + 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5" } diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py index 1ce0e1d1cb..0ff45cbfe7 100644 --- a/transformers/tokenization_bert_japanese.py +++ b/transformers/tokenization_bert_japanese.py @@ -33,10 +33,10 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} PRETRAINED_VOCAB_FILES_MAP = { 'vocab_file': { - 'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt", - 'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt", - 'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt", - 'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt" + 
'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt", + 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt", + 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt", + 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt" } } From 1748fdf657ed804f3edc1e45077b703cd8d6e4c5 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Wed, 11 Dec 2019 23:31:23 +0000 Subject: [PATCH 114/302] [doc] Fix rst table --- docs/source/pretrained_models.rst | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index d3498e057d..775772e896 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -63,22 +63,22 @@ Here is the full list of the currently provided pretrained models together with | | | (see `details on dbmdz repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``bert-base-japanese`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. | -| | | | `MeCab `__ is required for tokenization. | -| | | (see `details on cl-tohoku repository `__). | +| | | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece. | +| | | | `MeCab `__ is required for tokenization. | +| | | (see `details on cl-tohoku repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``bert-base-japanese-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. | -| | | | `MeCab `__ is required for tokenization. | -| | | (see `details on cl-tohoku repository `__). | +| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece. | +| | | | `MeCab `__ is required for tokenization. | +| | | (see `details on cl-tohoku repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``bert-base-japanese-char`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text. Text is tokenized into characters. | -| | | (see `details on cl-tohoku repository `__). | +| | | | Trained on Japanese text. Text is tokenized into characters. | +| | | (see `details on cl-tohoku repository `__). | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | -| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. 
| -| | | (see `details on cl-tohoku repository `__). | +| | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. | +| | | (see `details on cl-tohoku repository `__). | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | OpenAI GPT English model | From 413f41921b650418798f7d5c246316c4e1e5eb5d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 12 Dec 2019 07:34:42 +0100 Subject: [PATCH 115/302] fix merge --- transformers/tests/utils.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py index daed431995..c950ad8f17 100644 --- a/transformers/tests/utils.py +++ b/transformers/tests/utils.py @@ -9,14 +9,6 @@ from transformers.file_utils import _tf_available, _torch_available SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" -try: - run_slow = os.environ["RUN_SLOW"] -except KeyError: - # RUN_SLOW isn't set, default to skipping slow tests. - _run_slow_tests = False -else: - # RUN_SLOW is set, convert it to True or False. - def parse_flag_from_env(key, default=False): try: value = os.environ[key] From f69dbecc38e04cd4d158afb273921ca7b75c7cba Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 12 Dec 2019 10:25:36 +0100 Subject: [PATCH 116/302] Expose classification labels mapping (and reverse) in model config. --- transformers/configuration_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 08cee75d81..97b9fa8f80 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -58,6 +58,8 @@ class PretrainedConfig(object): self.use_bfloat16 = kwargs.pop('use_bfloat16', False) self.pruned_heads = kwargs.pop('pruned_heads', {}) self.is_decoder = kwargs.pop('is_decoder', False) + self.idx2label = kwargs.pop('idx2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) + self.label2idx = kwargs.pop('label2idx', dict(zip(self.idx2label.values(), self.idx2label.keys()))) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it From f19dad61c70a628545612e435c699263f02bc4a0 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 12 Dec 2019 14:46:30 +0100 Subject: [PATCH 117/302] fixing XLM conversion tests with dummy input --- transformers/modeling_tf_pytorch_utils.py | 6 +++++- transformers/modeling_tf_xlm.py | 2 +- transformers/modeling_xlm.py | 12 +++++++++++- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index 510e130c90..9d2b663dcb 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -78,6 +78,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i logger.info("Loading PyTorch weights from {}".format(pt_path)) pt_state_dict = torch.load(pt_path, map_location='cpu') + logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values()))) return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys) @@ -134,7 +135,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, 
pt_state_dict, tf_inputs=None, a start_prefix_to_remove = tf_model.base_model_prefix + '.' symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights - + tf_loaded_numel = 0 weight_value_tuples = [] all_pytorch_weights = set(list(pt_state_dict.keys())) for symbolic_weight in symbolic_weights: @@ -159,6 +160,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a e.args += (symbolic_weight.shape, array.shape) raise e + tf_loaded_numel += array.size # logger.warning("Initialize TF weight {}".format(symbolic_weight.name)) weight_value_tuples.append((symbolic_weight, array)) @@ -169,6 +171,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a if tf_inputs is not None: tfo = tf_model(tf_inputs, training=False) # Make sure restore ops are run + logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel)) + logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights)) return tf_model diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py index 6f11b0537d..903a8596c3 100644 --- a/transformers/modeling_tf_xlm.py +++ b/transformers/modeling_tf_xlm.py @@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel): langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) else: langs_list = None - return [inputs_list, attns_list, langs_list] + return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list} XLM_START_DOCSTRING = r""" The XLM model was proposed in diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 257f0da394..b604ae669d 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel): def __init__(self, *inputs, **kwargs): super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs) + @property + def dummy_inputs(self): + inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]) + attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + if self.config.use_lang_emb and self.config.n_langs > 1: + langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]]) + else: + langs_list = None + return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list} + def _init_weights(self, module): """ Initialize the weights. """ if isinstance(module, nn.Embedding): @@ -646,7 +656,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): langs=langs, token_type_ids=token_type_ids, position_ids=position_ids, - lengths=lengths, + lengths=lengths, cache=cache, head_mask=head_mask, inputs_embeds=inputs_embeds) From fbf5455a8607fa660aacbf06c16f6fe23758b13d Mon Sep 17 00:00:00 2001 From: Alan deLevie Date: Wed, 11 Dec 2019 10:14:48 -0500 Subject: [PATCH 118/302] Fix typo in examples/run_glue.py args declaration. 
deay -> decay --- examples/run_glue.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index 369a7110ab..1a51255c11 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -380,7 +380,7 @@ def main(): parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") + help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, From fe92755b992eb61239ad361abae3b71f86bbbba1 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 12 Dec 2019 11:37:19 -0500 Subject: [PATCH 119/302] Fix special tokens mask in encode --- transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index f44b77b27c..7e86742286 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -973,7 +973,7 @@ class PreTrainedTokenizer(object): token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else []) if return_special_tokens_mask: - encoded_inputs["special_tokens_mask"] = special_tokens_mask + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) encoded_inputs["input_ids"] = sequence if return_token_type_ids: From 5d67aa21aefaaa62594e8dfb56093b83c5f547bb Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Thu, 12 Dec 2019 12:39:41 -0500 Subject: [PATCH 120/302] [doc] Replicate doc from #2144 --- transformers/configuration_auto.py | 1 + transformers/configuration_utils.py | 1 + transformers/modeling_auto.py | 4 ++++ transformers/modeling_encoder_decoder.py | 2 ++ transformers/modeling_tf_auto.py | 4 ++++ transformers/modeling_tf_utils.py | 1 + transformers/modeling_utils.py | 1 + transformers/tokenization_auto.py | 11 +++++++++-- transformers/tokenization_utils.py | 4 ++-- 9 files changed, 25 insertions(+), 4 deletions(-) diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index 43f251bd0c..fbc5c59199 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -83,6 +83,7 @@ class AutoConfig(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 8ae30f2a48..82959adb57 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -79,6 +79,7 @@ class PretrainedConfig(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``. 
+ - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``. - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``. diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 6ba1aab7a3..96f45d8ec4 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -93,6 +93,7 @@ class AutoModel(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. @@ -231,6 +232,7 @@ class AutoModelWithLMHead(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. @@ -360,6 +362,7 @@ class AutoModelForSequenceClassification(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. @@ -478,6 +481,7 @@ class AutoModelForQuestionAnswering(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. 
+ - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index a884abd0a2..70f765b849 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -59,12 +59,14 @@ class PreTrainedEncoderDecoder(nn.Module): encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index cfe19ead2a..fac92eb866 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -81,6 +81,7 @@ class TFAutoModel(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). 
In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. @@ -212,6 +213,7 @@ class TFAutoModelWithLMHead(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. @@ -338,6 +340,7 @@ class TFAutoModelForSequenceClassification(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. @@ -453,6 +456,7 @@ class TFAutoModelForQuestionAnswering(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 4a6d18f447..d9a93af21b 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -177,6 +177,7 @@ class TFPreTrainedModel(tf.keras.Model): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards. 
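The docstring additions replicated across the files in this patch all describe the same behaviour: ``from_pretrained`` resolves a user-uploaded identifier of the form ``organization/model-name`` exactly like a shortcut name or a local directory. A minimal usage sketch, assuming the ``dbmdz/bert-base-german-cased`` files referenced in these docstrings are reachable from the S3 bucket:

from transformers import AutoConfig, AutoModel, AutoTokenizer

# Shortcut names, user-uploaded identifiers and local directories all go through the same call;
# only the string changes.
config = AutoConfig.from_pretrained('dbmdz/bert-base-german-cased')
tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
model = AutoModel.from_pretrained('dbmdz/bert-base-german-cased')
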
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 37088f8e67..676f355986 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -266,6 +266,7 @@ class PreTrainedModel(nn.Module): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``. - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``) diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index f36a584521..1f0599ef7f 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -86,6 +86,7 @@ class AutoTokenizer(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. @@ -108,8 +109,14 @@ class AutoTokenizer(object): Examples:: - tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 and cache. - tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')` + # Download vocabulary from S3 and cache. + tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased') + + # Download vocabulary from S3 (user-uploaded) and cache. + tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased') + + # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) + tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/') """ if 'distilbert' in pretrained_model_name_or_path: diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 7e86742286..317ecd167b 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -255,7 +255,7 @@ class PreTrainedTokenizer(object): pretrained_model_name_or_path: either: - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``. - - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmz/bert-base-german-cased``. 
+ - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``. - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``. @@ -284,7 +284,7 @@ class PreTrainedTokenizer(object): tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') # Download vocabulary from S3 (user-uploaded) and cache. - tokenizer = BertTokenizer.from_pretrained('dbmz/bert-base-german-cased') + tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased') # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`) tokenizer = BertTokenizer.from_pretrained('./test/saved_model/') From 7296f1010b6faaf3b1fb409bc5a9ebadcea51973 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Thu, 12 Dec 2019 13:01:04 -0500 Subject: [PATCH 121/302] Cleanup squad and add allow train_file and predict_file usage --- examples/run_squad.py | 22 ++++++++++++++-------- transformers/data/processors/squad.py | 6 ++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 79c8537a4b..117b86e32c 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -337,7 +337,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal else: logger.info("Creating features from dataset file at %s", input_dir) - if not args.data_dir: + if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)): try: import tensorflow_datasets as tfds except ImportError: @@ -350,7 +350,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate) else: processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor() - examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) + + if evaluate: + examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file) + else: + examples = processor.get_train_examples(args.data_dir, filename=args.train_file) features, dataset = squad_convert_examples_to_features( examples=examples, @@ -387,7 +391,14 @@ def main(): ## Other parameters parser.add_argument("--data_dir", default=None, type=str, - help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.") + help="The input data dir. Should contain the .json files for the task." + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") + parser.add_argument("--train_file", default=None, type=str, + help="The input training file. If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") + parser.add_argument("--predict_file", default=None, type=str, + help="The input evaluation file. 
If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.") parser.add_argument("--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name") parser.add_argument("--tokenizer_name", default="", type=str, @@ -472,11 +483,6 @@ def main(): parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() - args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format( - list(filter(None, args.model_name_or_path.split('/'))).pop(), - str(args.max_seq_length)) - ) - if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir)) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 3d7f832540..9bc4375684 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -373,6 +373,9 @@ class SquadProcessor(DataProcessor): which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. """ + if data_dir is None: + data_dir = "" + if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") @@ -389,6 +392,9 @@ class SquadProcessor(DataProcessor): filename: None by default, specify this if the evaluation file has a different name than the original one which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively. """ + if data_dir is None: + data_dir = "" + if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") From 9b312f9d41b85ed3a4cf68b8bb3c5126c6df2259 Mon Sep 17 00:00:00 2001 From: erenup Date: Fri, 13 Dec 2019 14:51:40 +0800 Subject: [PATCH 122/302] initial version for roberta squad --- examples/run_squad.py | 19 ++--- transformers/__init__.py | 2 +- transformers/data/metrics/squad_metrics.py | 14 ++-- transformers/data/processors/squad.py | 4 +- transformers/modeling_roberta.py | 86 ++++++++++++++++++++++ 5 files changed, 106 insertions(+), 19 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 117b86e32c..d124d07eb5 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -39,6 +39,7 @@ from tqdm import tqdm, trange from transformers import (WEIGHTS_NAME, BertConfig, BertForQuestionAnswering, BertTokenizer, + RobertaForQuestionAnswering, RobertaTokenizer, RobertaConfig, XLMConfig, XLMForQuestionAnswering, XLMTokenizer, XLNetConfig, XLNetForQuestionAnswering, @@ -53,10 +54,11 @@ from transformers import AdamW, get_linear_schedule_with_warmup, squad_convert_e logger = logging.getLogger(__name__) ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \ - for conf in (BertConfig, XLNetConfig, XLMConfig)), ()) + for conf in (BertConfig, RobertaConfig, XLNetConfig, XLMConfig)), ()) MODEL_CLASSES = { 'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer), + 'roberta': (RobertaConfig, RobertaForQuestionAnswering, RobertaTokenizer), 'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer), 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), @@ -141,13 +143,11 @@ def 
train(args, train_dataset, model, tokenizer): inputs = { 'input_ids': batch[0], 'attention_mask': batch[1], + 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], 'start_positions': batch[3], - 'end_positions': batch[4] + 'end_positions': batch[4], } - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] - if args.model_type in ['xlnet', 'xlm']: inputs.update({'cls_index': batch[5], 'p_mask': batch[6]}) @@ -241,12 +241,9 @@ def evaluate(args, model, tokenizer, prefix=""): with torch.no_grad(): inputs = { 'input_ids': batch[0], - 'attention_mask': batch[1] + 'attention_mask': batch[1], + 'token_type_ids': None if args.model_type in ['xlm', 'roberta', 'distilbert'] else batch[2], } - - if args.model_type != 'distilbert': - inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2] # XLM don't use segment_ids - example_indices = batch[3] # XLNet and XLM use more arguments for their predictions @@ -311,7 +308,7 @@ def evaluate(args, model, tokenizer, prefix=""): predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size, args.max_answer_length, args.do_lower_case, output_prediction_file, output_nbest_file, output_null_log_odds_file, args.verbose_logging, - args.version_2_with_negative, args.null_score_diff_threshold) + args.version_2_with_negative, args.null_score_diff_threshold, tokenizer) # Compute the F1 and exact scores. results = squad_evaluate(examples, predictions) diff --git a/transformers/__init__.py b/transformers/__init__.py index 5d7b0b772c..5353551e3e 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -99,7 +99,7 @@ if is_torch_available(): XLM_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_roberta import (RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification, RobertaForMultipleChoice, - RobertaForTokenClassification, + RobertaForTokenClassification, RobertaForQuestionAnswering, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_distilbert import (DistilBertPreTrainedModel, DistilBertForMaskedLM, DistilBertModel, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py index 7b03255f49..acbb884fb8 100644 --- a/transformers/data/metrics/squad_metrics.py +++ b/transformers/data/metrics/squad_metrics.py @@ -377,7 +377,8 @@ def compute_predictions_logits( output_null_log_odds_file, verbose_logging, version_2_with_negative, - null_score_diff_threshold + null_score_diff_threshold, + tokenizer, ): """Write final predictions to the json file and log-odds of null if needed.""" logger.info("Writing predictions to: %s" % (output_prediction_file)) @@ -474,11 +475,14 @@ def compute_predictions_logits( orig_doc_start = feature.token_to_orig_map[pred.start_index] orig_doc_end = feature.token_to_orig_map[pred.end_index] orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)] - tok_text = " ".join(tok_tokens) - # De-tokenize WordPieces that have been split off. - tok_text = tok_text.replace(" ##", "") - tok_text = tok_text.replace("##", "") + tok_text = tokenizer.convert_tokens_to_string(tok_tokens) + + # tok_text = " ".join(tok_tokens) + # + # # De-tokenize WordPieces that have been split off. 
+ # tok_text = tok_text.replace(" ##", "") + # tok_text = tok_text.replace("##", "") # Clean whitespace tok_text = tok_text.strip() diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 9bc4375684..3f5fd46382 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -140,7 +140,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) - if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: @@ -155,7 +154,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, spans = [] truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) - sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ + if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index fc27353d37..2f6f634fa6 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -555,3 +555,89 @@ class RobertaClassificationHead(nn.Module): x = self.dropout(x) x = self.out_proj(x) return x + + +@add_start_docstrings("""Roberta Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of + the hidden-states output to compute `span start logits` and `span end logits`). """, + ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING) +class RobertaForQuestionAnswering(BertPreTrainedModel): + r""" + **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). + Position outside of the sequence are not taken into account for computing the loss. + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-start scores (before SoftMax). + **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)`` + Span-end scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + Examples:: + tokenizer = RobertaTokenizer.from_pretrained('roberta-base') + model = RobertaForMultipleChoice.from_pretrained('roberta-base') + input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 + start_positions = torch.tensor([1]) + end_positions = torch.tensor([3]) + outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) + loss, start_scores, end_scores = outputs[:2] + """ + config_class = RobertaConfig + pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + base_model_prefix = "roberta" + + def __init__(self, config): + super(RobertaForQuestionAnswering, self).__init__(config) + self.num_labels = config.num_labels + + self.roberta = RobertaModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, + start_positions=None, end_positions=None): + + outputs = self.roberta(input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + outputs = (start_logits, end_logits,) + outputs[2:] + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + outputs = (total_loss,) + outputs + + return outputs # (loss), start_logits, end_logits, (hidden_states), (attentions) \ No newline at end of file From 33e72b08d54bf5edd192492af7549b581563ecc2 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 13 Dec 2019 11:33:05 +0100 Subject: [PATCH 123/302] fix inner dimensions for 3B/11B models --- transformers/modeling_t5.py | 27 +++++++++++---------------- transformers/modeling_tf_t5.py | 20 ++++++++------------ 2 files changed, 19 insertions(+), 28 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 149b977abc..c9310179a3 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -30,7 +30,7 @@ from torch import nn import torch.nn.functional as F from torch.nn import CrossEntropyLoss, MSELoss -from .modeling_utils import PreTrainedModel +from .modeling_utils import PreTrainedModel, prune_linear_layer from .configuration_t5 import T5Config from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK @@ -191,28 +191,26 @@ class T5Attention(nn.Module): self.output_attentions = 
config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets - self.dim = config.d_model + self.d_model = config.d_model self.d_kv = config.d_kv self.n_heads = config.num_heads self.dropout = config.dropout_rate - assert self.dim % self.n_heads == 0 - assert self.dim // self.n_heads == self.d_kv + self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = nn.Linear(self.dim, self.dim, bias=False) - self.k = nn.Linear(self.dim, self.dim, bias=False) - self.v = nn.Linear(self.dim, self.dim, bias=False) - self.o = nn.Linear(self.dim, self.dim, bias=False) + self.q = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.k = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.v = nn.Linear(self.d_model, self.inner_dim, bias=False) + self.o = nn.Linear(self.inner_dim, self.d_model, bias=False) if self.has_relative_attention_bias: self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads) self.pruned_heads = set() def prune_heads(self, heads): - attention_head_size = self.dim // self.n_heads if len(heads) == 0: return - mask = torch.ones(self.n_heads, attention_head_size) + mask = torch.ones(self.n_heads, self.d_kv) heads = set(heads) - self.pruned_heads for head in heads: head -= sum(1 if h < head else 0 for h in self.pruned_heads) @@ -226,7 +224,7 @@ class T5Attention(nn.Module): self.o = prune_linear_layer(self.o, index, dim=1) # Update hyper params self.n_heads = self.n_heads - len(heads) - self.dim = attention_head_size * self.n_heads + self.inner_dim = self.d_kv * self.n_heads self.pruned_heads = self.pruned_heads.union(heads) @staticmethod @@ -303,17 +301,14 @@ class T5Attention(nn.Module): klen = qlen if cache is None else cache['slen'] + qlen else: klen = kv.size(1) - # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) - n_heads = self.n_heads - dim_per_head = self.dim // n_heads def shape(x): """ projection """ - return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2) + return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2) def unshape(x): """ compute context """ - return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head) + return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim) q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index fd25328ac6..0ae7fff412 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -108,17 +108,16 @@ class TFT5Attention(tf.keras.layers.Layer): self.output_attentions = config.output_attentions self.relative_attention_num_buckets = config.relative_attention_num_buckets - self.dim = config.d_model + self.d_model = config.d_model self.d_kv = config.d_kv self.n_heads = config.num_heads - assert self.dim % self.n_heads == 0 - assert self.dim // self.n_heads == self.d_kv + self.inner_dim = self.n_heads * self.d_kv # Mesh TensorFlow initialization to avoid scaling before softmax - self.q = tf.keras.layers.Dense(self.dim, use_bias=False, name='q') - self.k = tf.keras.layers.Dense(self.dim, use_bias=False, name='k') - self.v = tf.keras.layers.Dense(self.dim, use_bias=False, name='v') - self.o = tf.keras.layers.Dense(self.dim, use_bias=False, name='o') + self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q') + self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k') + 
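# Note on the sizing above: the q/k/v projections are now built with inner_dim = n_heads * d_kv
# instead of d_model. For t5-small/base/large the two coincide, but (going by the released T5
# configurations, stated here as an assumption) the larger checkpoints differ:
#   t5-3b : d_model=1024, num_heads=32,  d_kv=128 -> inner_dim = 4096
#   t5-11b: d_model=1024, num_heads=128, d_kv=128 -> inner_dim = 16384
# so the earlier assumption that d_model == num_heads * d_kv does not hold for them, which is
# what this patch's subject ("fix inner dimensions for 3B/11B models") refers to.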
self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v') + self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o') self.dropout = tf.keras.layers.Dropout(config.dropout_rate) if self.has_relative_attention_bias: @@ -199,17 +198,14 @@ class TFT5Attention(tf.keras.layers.Layer): klen = qlen if cache is None else cache['slen'] + qlen else: klen = shape_list(kv)[1] - # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim) - n_heads = self.n_heads - dim_per_head = self.dim // n_heads def shape(x): """ projection """ - return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3)) + return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3)) def unshape(x): """ compute context """ - return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head)) + return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim)) q = shape(self.q(input)) # (bs, n_heads, qlen, dim_per_head) if kv is None: From 80eacb8f16208bcc7ffd8ed5b5750d6fc6854a24 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 13 Dec 2019 14:10:22 +0100 Subject: [PATCH 124/302] Adding labels mapping for classification models in their respective config. --- transformers/configuration_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 97b9fa8f80..b7ddcf0912 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -58,8 +58,8 @@ class PretrainedConfig(object): self.use_bfloat16 = kwargs.pop('use_bfloat16', False) self.pruned_heads = kwargs.pop('pruned_heads', {}) self.is_decoder = kwargs.pop('is_decoder', False) - self.idx2label = kwargs.pop('idx2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) - self.label2idx = kwargs.pop('label2idx', dict(zip(self.idx2label.values(), self.idx2label.keys()))) + self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) + self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys()))) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it From be5bf7b81bd4171169e23091beda85ffd97f950f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 13 Dec 2019 14:12:17 +0100 Subject: [PATCH 125/302] Added NER pipeline. --- transformers/pipelines.py | 720 ++++++++++++++++++++------------------ 1 file changed, 388 insertions(+), 332 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index da8b0b65a7..b0b5848c01 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -1,332 +1,388 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-from __future__ import absolute_import, division, print_function, unicode_literals - -import os -from abc import ABC, abstractmethod -from typing import Union, Optional, Tuple, List, Dict - -import numpy as np - -from transformers import is_tf_available, is_torch_available, logger, AutoTokenizer, PreTrainedTokenizer, \ - SquadExample, squad_convert_examples_to_features - -if is_tf_available(): - from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering - -if is_torch_available(): - from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering - - -class Pipeline(ABC): - def __init__(self, model, tokenizer: PreTrainedTokenizer = None, **kwargs): - self.model = model - self.tokenizer = tokenizer - - @classmethod - @abstractmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - raise NotImplementedError() - - def save_pretrained(self, save_directory): - if not os.path.isdir(save_directory): - logger.error("Provided path ({}) should be a directory".format(save_directory)) - return - - self.model.save_pretrained(save_directory) - self.tokenizer.save_pretrained(save_directory) - - def transform(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - return self(*texts, **kwargs) - - def predict(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - return self(*texts, **kwargs) - - @abstractmethod - def __call__(self, *texts, **kwargs): - raise NotImplementedError() - - -class TextClassificationPipeline(Pipeline): - def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2): - super().__init__(model, tokenizer) - - if nb_classes < 2: - raise Exception('Invalid parameter nb_classes. int >= 2 is required (got: {})'.format(nb_classes)) - self._nb_classes = nb_classes - - @classmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - return cls(model, tokenizer, **kwargs) - - def __call__(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - if 'X' in kwargs and not texts: - texts = kwargs.pop('X') - - inputs = self.tokenizer.batch_encode_plus( - texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' - ) - - special_tokens_mask = inputs.pop('special_tokens_mask') - - if is_tf_available(): - # TODO trace model - predictions = self.model(**inputs)[0] - else: - import torch - with torch.no_grad(): - predictions = self.model(**inputs)[0] - - return predictions.numpy().tolist() - - -class QuestionAnsweringPipeline(Pipeline): - """ - Question Answering pipeling involving Tokenization and Inference. 
- """ - - @classmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - pass - - @staticmethod - def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: - is_list = isinstance(question, list) - - if is_list: - return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] - else: - return SquadExample(None, question, context, None, None, None) - - @staticmethod - def handle_args(*inputs, **kwargs) -> List[SquadExample]: - # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating - if inputs is not None and len(inputs) > 1: - kwargs['X'] = inputs - - # Generic compatibility with sklearn and Keras - # Batched data - if 'X' in kwargs or 'data' in kwargs: - data = kwargs['X'] if 'X' in kwargs else kwargs['data'] - - if not isinstance(data, list): - data = [data] - - for i, item in enumerate(data): - if isinstance(item, dict): - if any(k not in item for k in ['question', 'context']): - raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') - data[i] = QuestionAnsweringPipeline.create_sample(**item) - - elif isinstance(item, SquadExample): - continue - else: - raise ValueError( - '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' - .format('X' if 'X' in kwargs else 'data') - ) - inputs = data - - # Tabular input - elif 'question' in kwargs and 'context' in kwargs: - if isinstance(kwargs['question'], str): - kwargs['question'] = [kwargs['question']] - - if isinstance(kwargs['context'], str): - kwargs['context'] = [kwargs['context']] - - inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] - else: - raise ValueError('Unknown arguments {}'.format(kwargs)) - - if not isinstance(inputs, list): - inputs = [inputs] - - return inputs - - def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]): - super().__init__(model, tokenizer) - - def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: - args = ['input_ids', 'attention_mask'] - model_type = type(self.model).__name__.lower() - - if 'distilbert' not in model_type and 'xlm' not in model_type: - args += ['token_type_ids'] - - if 'xlnet' in model_type or 'xlm' in model_type: - args += ['cls_index', 'p_mask'] - - if isinstance(features, SquadExample): - return {k: features.__dict__[k] for k in args} - else: - return {k: [feature.__dict__[k] for feature in features] for k in args} - - def __call__(self, *texts, **kwargs): - # Set defaults values - kwargs.setdefault('topk', 1) - kwargs.setdefault('doc_stride', 128) - kwargs.setdefault('max_answer_len', 15) - kwargs.setdefault('max_seq_len', 384) - kwargs.setdefault('max_question_len', 64) - - if kwargs['topk'] < 1: - raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk'])) - - if kwargs['max_answer_len'] < 1: - raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) - - examples = QuestionAnsweringPipeline.handle_args(texts, **kwargs) - - # Convert inputs to features - features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) - fw_args = self.inputs_for_model(features) - - if is_tf_available(): - import tensorflow as tf - fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} - start, end = self.model(fw_args) - start, 
end = start.numpy(), end.numpy() - else: - import torch - with torch.no_grad(): - # Retrieve the score for the context tokens only (removing question tokens) - fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()} - start, end = self.model(**fw_args) - start, end = start.cpu().numpy(), end.cpu().numpy() - - answers = [] - for (example, feature, start_, end_) in zip(examples, features, start, end): - # Normalize logits and spans to retrieve the answer - start_ = np.exp(start_) / np.sum(np.exp(start_)) - end_ = np.exp(end_) / np.sum(np.exp(end_)) - - # Mask padding and question - start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1) - - # Mask CLS - start_[0] = end_[0] = 0 - - starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len']) - char_to_word = np.array(example.char_to_word_offset) - - # Convert the answer (tokens) back to the original text - answers += [[ - { - 'score': score, - 'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0], - 'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1], - 'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1]) - } - for s, e, score in zip(starts, ends, scores) - ]] - - return answers - - def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: - # Ensure we have batch axis - if start.ndim == 1: - start = start[None] - - if end.ndim == 1: - end = end[None] - - # Compute the score of each tuple(start, end) to be the real answer - outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1)) - - # Remove candidate with end < start and end - start > max_answer_len - candidates = np.tril(np.triu(outer), max_answer_len - 1) - - # Inspired by Chen & al. 
(https://github.com/facebookresearch/DrQA) - scores_flat = candidates.flatten() - if topk == 1: - idx_sort = [np.argmax(scores_flat)] - elif len(scores_flat) < topk: - idx_sort = np.argsort(-scores_flat) - else: - idx = np.argpartition(-scores_flat, topk)[0:topk] - idx_sort = idx[np.argsort(-scores_flat[idx])] - - start, end = np.unravel_index(idx_sort, candidates.shape)[1:] - return start, end, candidates[0, start, end] - - def span_to_answer(self, text: str, start: int, end: int): - words = [] - token_idx = char_start_idx = char_end_idx = chars_idx = 0 - - for i, word in enumerate(text.split(" ")): - token = self.tokenizer.tokenize(word) - - # Append words if they are in the span - if start <= token_idx <= end: - if token_idx == start: - char_start_idx = chars_idx - - if token_idx == end: - char_end_idx = chars_idx + len(word) - - words += [word] - - # Stop if we went over the end of the answer - if token_idx > end: - break - - # Append the subtokenization length to the running index - token_idx += len(token) - chars_idx += len(word) + 1 - - # Join text with spaces - return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)} - - -# Register all the supported task here -SUPPORTED_TASKS = { - 'text-classification': { - 'impl': TextClassificationPipeline, - 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, - 'pt': AutoModelForSequenceClassification if is_torch_available() else None - }, - 'question-answering': { - 'impl': QuestionAnsweringPipeline, - 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, - 'pt': AutoModelForQuestionAnswering if is_torch_available() else None - } -} - - -def pipeline(task: str, model, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline: - """ - Utility factory method to build pipeline. - """ - # Try to infer tokenizer from model name (if provided as str) - if tokenizer is None and isinstance(model, str): - tokenizer = model - else: - # Impossible to guest what is the right tokenizer here - raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') - - tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer) - - if task not in SUPPORTED_TASKS: - raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) - - targeted_task = SUPPORTED_TASKS[task] - task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] - - model = allocator.from_pretrained(model) - return task(model, tokenizer, **kwargs) +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
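For orientation, the module re-added below is driven through the pipeline() factory registered in SUPPORTED_TASKS at the bottom of the file. A minimal usage sketch, assuming the 'question-answering' task shown further down and using 'distilbert-base-uncased-distilled-squad' purely as an illustrative checkpoint name (it is not prescribed by this patch):

    from transformers.pipelines import pipeline

    # The tokenizer is inferred from the model identifier when none is passed explicitly.
    nlp = pipeline('question-answering', 'distilbert-base-uncased-distilled-squad')

    # QuestionAnsweringPipeline.handle_args accepts question/context keyword arguments.
    predictions = nlp(question='Which library defines pipelines?',
                      context='Pipelines are defined in the transformers library.')

    # For each example: a list of {'score', 'start', 'end', 'answer'} dicts, one per top-k candidate.
    print(predictions)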
+from __future__ import absolute_import, division, print_function, unicode_literals + +import os +from abc import ABC, abstractmethod +from itertools import groupby +from typing import Union, Optional, Tuple, List, Dict + +import numpy as np + +from transformers import AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \ + SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger + +if is_tf_available(): + from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification + +if is_torch_available(): + import torch + from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification + + +class Pipeline(ABC): + def __init__(self, model, tokenizer: PreTrainedTokenizer = None, **kwargs): + self.model = model + self.tokenizer = tokenizer + + @classmethod + @abstractmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + raise NotImplementedError() + + def save_pretrained(self, save_directory): + if not os.path.isdir(save_directory): + logger.error("Provided path ({}) should be a directory".format(save_directory)) + return + + self.model.save_pretrained(save_directory) + self.tokenizer.save_pretrained(save_directory) + + def transform(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + return self(*texts, **kwargs) + + def predict(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + return self(*texts, **kwargs) + + @abstractmethod + def __call__(self, *texts, **kwargs): + raise NotImplementedError() + + +class TextClassificationPipeline(Pipeline): + def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2): + super().__init__(model, tokenizer) + + if nb_classes < 2: + raise Exception('Invalid parameter nb_classes. 
int >= 2 is required (got: {})'.format(nb_classes)) + self._nb_classes = nb_classes + + @classmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + return cls(model, tokenizer, **kwargs) + + def __call__(self, *texts, **kwargs): + # Generic compatibility with sklearn and Keras + if 'X' in kwargs and not texts: + texts = kwargs.pop('X') + + inputs = self.tokenizer.batch_encode_plus( + texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + ) + + special_tokens_mask = inputs.pop('special_tokens_mask') + + if is_tf_available(): + # TODO trace model + predictions = self.model(**inputs)[0] + else: + import torch + with torch.no_grad(): + predictions = self.model(**inputs)[0] + + return predictions.numpy().tolist() + + +class NerPipeline(Pipeline): + + def __init__(self, model, tokenizer: PreTrainedTokenizer): + super().__init__(model, tokenizer) + + @classmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + pass + + def __call__(self, *texts, **kwargs): + (texts, ), answers = texts, [] + + for sentence in texts: + + # Ugly token to word idx mapping (for now) + token_to_word, words = [], sentence.split(' ') + for i, w in enumerate(words): + tokens = self.tokenizer.tokenize(w) + token_to_word += [i] * len(tokens) + tokens = self.tokenizer.encode_plus(sentence, return_attention_mask=False, return_tensors='tf' if is_tf_available() else 'pt') + + # Forward + if is_torch_available(): + with torch.no_grad(): + entities = self.model(**tokens)[0][0].cpu().numpy() + else: + entities = self.model(tokens)[0][0].numpy() + + # Normalize scores + answer, token_start = [], 1 + for idx, word in groupby(token_to_word[1:-1]): + + # Sum log prob over token, then normalize across labels + score = np.exp(entities[token_start]) / np.exp(entities[token_start]).sum(-1, keepdims=True) + label_idx = score.argmax() + + answer += [{ + 'word': words[idx - 1], 'score': score[label_idx], 'entity': self.model.config.id2label[label_idx] + }] + + # Update token start + token_start += len(list(word)) + + # Append + answers += [answer] + return answers + + +class QuestionAnsweringPipeline(Pipeline): + """ + Question Answering pipeline involving Tokenization and Inference. 
+ """ + + @classmethod + def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): + pass + + @staticmethod + def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: + is_list = isinstance(question, list) + + if is_list: + return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] + else: + return SquadExample(None, question, context, None, None, None) + + @staticmethod + def handle_args(*inputs, **kwargs) -> List[SquadExample]: + # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating + if inputs is not None and len(inputs) > 1: + kwargs['X'] = inputs + + # Generic compatibility with sklearn and Keras + # Batched data + if 'X' in kwargs or 'data' in kwargs: + data = kwargs['X'] if 'X' in kwargs else kwargs['data'] + + if not isinstance(data, list): + data = [data] + + for i, item in enumerate(data): + if isinstance(item, dict): + if any(k not in item for k in ['question', 'context']): + raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') + data[i] = QuestionAnsweringPipeline.create_sample(**item) + + elif isinstance(item, SquadExample): + continue + else: + raise ValueError( + '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' + .format('X' if 'X' in kwargs else 'data') + ) + inputs = data + + # Tabular input + elif 'question' in kwargs and 'context' in kwargs: + if isinstance(kwargs['question'], str): + kwargs['question'] = [kwargs['question']] + + if isinstance(kwargs['context'], str): + kwargs['context'] = [kwargs['context']] + + inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] + else: + raise ValueError('Unknown arguments {}'.format(kwargs)) + + if not isinstance(inputs, list): + inputs = [inputs] + + return inputs + + def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]): + super().__init__(model, tokenizer) + + def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: + args = ['input_ids', 'attention_mask'] + model_type = type(self.model).__name__.lower() + + if 'distilbert' not in model_type and 'xlm' not in model_type: + args += ['token_type_ids'] + + if 'xlnet' in model_type or 'xlm' in model_type: + args += ['cls_index', 'p_mask'] + + if isinstance(features, SquadExample): + return {k: features.__dict__[k] for k in args} + else: + return {k: [feature.__dict__[k] for feature in features] for k in args} + + def __call__(self, *texts, **kwargs): + # Set defaults values + kwargs.setdefault('topk', 1) + kwargs.setdefault('doc_stride', 128) + kwargs.setdefault('max_answer_len', 15) + kwargs.setdefault('max_seq_len', 384) + kwargs.setdefault('max_question_len', 64) + + if kwargs['topk'] < 1: + raise ValueError('topk parameter should be >= 1 (got {})'.format(kwargs['topk'])) + + if kwargs['max_answer_len'] < 1: + raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) + + examples = QuestionAnsweringPipeline.handle_args(texts, **kwargs) + + # Convert inputs to features + features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) + fw_args = self.inputs_for_model(features) + + if is_tf_available(): + import tensorflow as tf + fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} + start, end = self.model(fw_args) + start, 
end = start.numpy(), end.numpy()
+        else:
+            import torch
+            with torch.no_grad():
+                # Retrieve the score for the context tokens only (removing question tokens)
+                fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()}
+                start, end = self.model(**fw_args)
+                start, end = start.cpu().numpy(), end.cpu().numpy()
+
+        answers = []
+        for (example, feature, start_, end_) in zip(examples, features, start, end):
+            # Normalize logits and spans to retrieve the answer
+            start_ = np.exp(start_) / np.sum(np.exp(start_))
+            end_ = np.exp(end_) / np.sum(np.exp(end_))
+
+            # Mask padding and question
+            start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1)
+
+            # TODO : What happens if no answer is possible
+            # Mask CLS
+            start_[0] = end_[0] = 0
+
+            starts, ends, scores = self.decode(start_, end_, kwargs['topk'], kwargs['max_answer_len'])
+            char_to_word = np.array(example.char_to_word_offset)
+
+            # Convert the answer (tokens) back to the original text
+            answers += [[
+                {
+                    'score': score,
+                    'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0],
+                    'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1],
+                    'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1])
+                }
+                for s, e, score in zip(starts, ends, scores)
+            ]]
+
+        return answers
+
+    def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
+        # Ensure we have batch axis
+        if start.ndim == 1:
+            start = start[None]
+
+        if end.ndim == 1:
+            end = end[None]
+
+        # Compute the score of each tuple(start, end) to be the real answer
+        outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
+
+        # Remove candidate with end < start and end - start > max_answer_len
+        candidates = np.tril(np.triu(outer), max_answer_len - 1)
+
+        # Inspired by Chen & al. 
(https://github.com/facebookresearch/DrQA) + scores_flat = candidates.flatten() + if topk == 1: + idx_sort = [np.argmax(scores_flat)] + elif len(scores_flat) < topk: + idx_sort = np.argsort(-scores_flat) + else: + idx = np.argpartition(-scores_flat, topk)[0:topk] + idx_sort = idx[np.argsort(-scores_flat[idx])] + + start, end = np.unravel_index(idx_sort, candidates.shape)[1:] + return start, end, candidates[0, start, end] + + def span_to_answer(self, text: str, start: int, end: int): + words = [] + token_idx = char_start_idx = char_end_idx = chars_idx = 0 + + for i, word in enumerate(text.split(" ")): + token = self.tokenizer.tokenize(word) + + # Append words if they are in the span + if start <= token_idx <= end: + if token_idx == start: + char_start_idx = chars_idx + + if token_idx == end: + char_end_idx = chars_idx + len(word) + + words += [word] + + # Stop if we went over the end of the answer + if token_idx > end: + break + + # Append the subtokenization length to the running index + token_idx += len(token) + chars_idx += len(word) + 1 + + # Join text with spaces + return {'answer': ' '.join(words), 'start': max(0, char_start_idx), 'end': min(len(text), char_end_idx)} + + +# Register all the supported task here +SUPPORTED_TASKS = { + 'text-classification': { + 'impl': TextClassificationPipeline, + 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, + 'pt': AutoModelForSequenceClassification if is_torch_available() else None + }, + 'ner': { + 'impl': NerPipeline, + 'tf': TFAutoModelForTokenClassification if is_tf_available() else None, + 'pt': AutoModelForTokenClassification if is_torch_available() else None, + }, + 'question-answering': { + 'impl': QuestionAnsweringPipeline, + 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, + 'pt': AutoModelForQuestionAnswering if is_torch_available() else None + } +} + + +def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline: + """ + Utility factory method to build pipeline. + """ + # Try to infer tokenizer from model name (if provided as str) + if tokenizer is None and isinstance(model, str): + tokenizer = model + else: + # Impossible to guest what is the right tokenizer here + raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') + + tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer) + + if task not in SUPPORTED_TASKS: + raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) + + targeted_task = SUPPORTED_TASKS[task] + task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] + + model = allocator.from_pretrained(model) + return task(model, tokenizer, **kwargs) From 28e64ad5a4b01a1b7de092694e3a321edf7021bd Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 13 Dec 2019 14:12:54 +0100 Subject: [PATCH 126/302] Raise an exception if the pipeline allocator can't determine the tokenizer from the model. --- transformers/pipelines.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index b0b5848c01..853735a256 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -370,11 +370,12 @@ def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokeni Utility factory method to build pipeline. 
""" # Try to infer tokenizer from model name (if provided as str) - if tokenizer is None and isinstance(model, str): - tokenizer = model - else: - # Impossible to guest what is the right tokenizer here - raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') + if not isinstance(tokenizer, PreTrainedTokenizer): + if not isinstance(model, str): + # Impossible to guest what is the right tokenizer here + raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') + else: + tokenizer = model tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer) From 1ca52567a4059e7ee1707de6a855bb5e7fb3fac3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 13 Dec 2019 14:13:14 +0100 Subject: [PATCH 127/302] Allow model conversion in the pipeline allocator. --- transformers/pipelines.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 853735a256..9acd9bc566 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -385,5 +385,17 @@ def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokeni targeted_task = SUPPORTED_TASKS[task] task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] - model = allocator.from_pretrained(model) + # Special handling for model conversion + from_tf = model.endswith('.h5') and not is_tf_available() + from_pt = model.endswith('.bin') and not is_torch_available() + + if from_tf: + logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. Trying to load the model with PyTorch.') + elif from_pt: + logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. Trying to load the model with Tensorflow.') + + if allocator.__name__.startswith('TF'): + model = allocator.from_pretrained(model, config=config, from_pt=from_pt) + else: + model = allocator.from_pretrained(model, config=config, from_tf=from_tf) return task(model, tokenizer, **kwargs) From 8938b546bf5f61dcb65fb6dd72b5b924f773c46a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 13 Dec 2019 14:27:04 +0100 Subject: [PATCH 128/302] Removed from_config --- transformers/pipelines.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 9acd9bc566..6fbb7e2f04 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -37,11 +37,6 @@ class Pipeline(ABC): self.model = model self.tokenizer = tokenizer - @classmethod - @abstractmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - raise NotImplementedError() - def save_pretrained(self, save_directory): if not os.path.isdir(save_directory): logger.error("Provided path ({}) should be a directory".format(save_directory)) @@ -63,6 +58,12 @@ class Pipeline(ABC): raise NotImplementedError() +class FeatureExtractionPipeline(Pipeline): + + def __call__(self, *texts, **kwargs): + pass + + class TextClassificationPipeline(Pipeline): def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2): super().__init__(model, tokenizer) @@ -71,10 +72,6 @@ class TextClassificationPipeline(Pipeline): raise Exception('Invalid parameter nb_classes. 
int >= 2 is required (got: {})'.format(nb_classes)) self._nb_classes = nb_classes - @classmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - return cls(model, tokenizer, **kwargs) - def __call__(self, *texts, **kwargs): # Generic compatibility with sklearn and Keras if 'X' in kwargs and not texts: @@ -102,10 +99,6 @@ class NerPipeline(Pipeline): def __init__(self, model, tokenizer: PreTrainedTokenizer): super().__init__(model, tokenizer) - @classmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - pass - def __call__(self, *texts, **kwargs): (texts, ), answers = texts, [] From 47f0e3cfb7df192ab80215cea9096791fce08694 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 13 Dec 2019 14:33:24 +0100 Subject: [PATCH 129/302] cleaning up configuration classes --- .../summarization/configuration_bertabs.py | 10 +-- .../adding_a_new_model/configuration_xxx.py | 12 +-- .../tests/modeling_tf_xxx_test.py | 2 +- .../tests/modeling_xxx_test.py | 2 +- transformers/configuration_albert.py | 6 +- transformers/configuration_bert.py | 38 +++----- transformers/configuration_ctrl.py | 23 +---- transformers/configuration_distilbert.py | 40 ++++----- transformers/configuration_gpt2.py | 55 ++++-------- transformers/configuration_openai.py | 57 +++++------- transformers/configuration_transfo_xl.py | 26 ++---- transformers/configuration_utils.py | 27 ++++-- transformers/configuration_xlm.py | 88 ++++++++----------- transformers/configuration_xlnet.py | 81 +++++++---------- ..._original_pytorch_checkpoint_to_pytorch.py | 2 +- transformers/modeling_gpt2.py | 1 + transformers/modeling_tf_gpt2.py | 1 + transformers/modeling_tf_transfo_xl.py | 6 +- .../modeling_tf_transfo_xl_utilities.py | 12 +-- transformers/modeling_tf_xlnet.py | 2 +- transformers/modeling_transfo_xl.py | 10 +-- transformers/modeling_xlnet.py | 4 +- transformers/tests/modeling_albert_test.py | 2 +- transformers/tests/modeling_bert_test.py | 2 +- transformers/tests/modeling_common_test.py | 2 +- transformers/tests/modeling_ctrl_test.py | 2 +- .../tests/modeling_distilbert_test.py | 2 +- transformers/tests/modeling_gpt2_test.py | 2 +- transformers/tests/modeling_openai_test.py | 2 +- transformers/tests/modeling_roberta_test.py | 2 +- transformers/tests/modeling_tf_albert_test.py | 2 +- transformers/tests/modeling_tf_bert_test.py | 2 +- transformers/tests/modeling_tf_ctrl_test.py | 2 +- .../tests/modeling_tf_distilbert_test.py | 2 +- transformers/tests/modeling_tf_gpt2_test.py | 2 +- .../tests/modeling_tf_openai_gpt_test.py | 2 +- .../tests/modeling_tf_roberta_test.py | 2 +- .../tests/modeling_tf_transfo_xl_test.py | 2 +- transformers/tests/modeling_tf_xlm_test.py | 2 +- transformers/tests/modeling_tf_xlnet_test.py | 5 +- .../tests/modeling_transfo_xl_test.py | 2 +- transformers/tests/modeling_xlm_test.py | 2 +- transformers/tests/modeling_xlnet_test.py | 5 +- 43 files changed, 224 insertions(+), 329 deletions(-) diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py index 5bcb65b423..054763ea93 100644 --- a/examples/summarization/configuration_bertabs.py +++ b/examples/summarization/configuration_bertabs.py @@ -65,7 +65,7 @@ class BertAbsConfig(PretrainedConfig): def __init__( self, - vocab_size_or_config_json_file=30522, + vocab_size=30522, max_pos=512, enc_layers=6, enc_hidden_size=512, @@ -81,14 +81,14 @@ class BertAbsConfig(PretrainedConfig): ): super(BertAbsConfig, self).__init__(**kwargs) - if 
self._input_is_path_to_json(vocab_size_or_config_json_file): - path_to_json = vocab_size_or_config_json_file + if self._input_is_path_to_json(vocab_size): + path_to_json = vocab_size with open(path_to_json, "r", encoding="utf-8") as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file + elif isinstance(vocab_size, int): + self.vocab_size = vocab_size self.max_pos = max_pos self.enc_layers = enc_layers diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py index b1614e71af..ca9e0d554b 100644 --- a/templates/adding_a_new_model/configuration_xxx.py +++ b/templates/adding_a_new_model/configuration_xxx.py @@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig): Arguments: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`. + vocab_size: Vocabulary size of `inputs_ids` in `XxxModel`. hidden_size: Size of the encoder layers and the pooler layer. num_hidden_layers: Number of hidden layers in the Transformer encoder. num_attention_heads: Number of attention heads for each attention layer in @@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig): pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=50257, + vocab_size=50257, n_positions=1024, n_ctx=1024, n_embd=768, @@ -84,7 +84,7 @@ class XxxConfig(PretrainedConfig): summary_first_dropout=0.1, **kwargs): super(XxxConfig, self).__init__(**kwargs) - self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1 + self.vocab_size = vocab_size if isinstance(vocab_size, six.string_types) else -1 self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd @@ -102,12 +102,12 @@ class XxxConfig(PretrainedConfig): self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels - if isinstance(vocab_size_or_config_json_file, six.string_types): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: + if isinstance(vocab_size, six.string_types): + with open(vocab_size, "r", encoding="utf-8") as reader: json_config = json.loads(reader.read()) for key, value in json_config.items(): self.__dict__[key] = value - elif not isinstance(vocab_size_or_config_json_file, int): + elif not isinstance(vocab_size, int): raise ValueError( "First argument must be either a vocabulary size (int)" "or the path to a pretrained model config file (str)" diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index d7e576bf8b..912a4aa340 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -111,7 +111,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = XxxConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py index bfc70921cd..30e614b3f2 100644 --- 
a/templates/adding_a_new_model/tests/modeling_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py @@ -109,7 +109,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = XxxConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py index de665c9b1c..6a1ef78dd5 100644 --- a/transformers/configuration_albert.py +++ b/transformers/configuration_albert.py @@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig): pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=30000, + vocab_size=30000, embedding_size=128, hidden_size=4096, num_hidden_layers=12, @@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig): """ super(AlbertConfig, self).__init__(**kwargs) - self.vocab_size = vocab_size_or_config_json_file + self.vocab_size = vocab_size self.embedding_size = embedding_size self.hidden_size = hidden_size self.num_hidden_layers = num_hidden_layers @@ -97,4 +97,4 @@ class AlbertConfig(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.type_vocab_size = type_vocab_size self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps \ No newline at end of file + self.layer_norm_eps = layer_norm_eps diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 01fcd88cb8..9072820bce 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -56,7 +56,7 @@ class BertConfig(PretrainedConfig): Arguments: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`. + vocab_size: Vocabulary size of `inputs_ids` in `BertModel`. hidden_size: Size of the encoder layers and the pooler layer. num_hidden_layers: Number of hidden layers in the Transformer encoder. 
num_attention_heads: Number of attention heads for each attention layer in @@ -81,7 +81,7 @@ class BertConfig(PretrainedConfig): pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=30522, + vocab_size=30522, hidden_size=768, num_hidden_layers=12, num_attention_heads=12, @@ -95,25 +95,15 @@ class BertConfig(PretrainedConfig): layer_norm_eps=1e-12, **kwargs): super(BertConfig, self).__init__(**kwargs) - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.hidden_size = hidden_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.hidden_act = hidden_act - self.intermediate_size = intermediate_size - self.hidden_dropout_prob = hidden_dropout_prob - self.attention_probs_dropout_prob = attention_probs_dropout_prob - self.max_position_embeddings = max_position_embeddings - self.type_vocab_size = type_vocab_size - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py index fcbd848dec..f9b9e409e1 100644 --- a/transformers/configuration_ctrl.py +++ b/transformers/configuration_ctrl.py @@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig): """Configuration class to store the configuration of a `CTRLModel`. Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. + vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). dff: Size of the inner dimension of the FFN. @@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig): def __init__( self, - vocab_size_or_config_json_file=246534, + vocab_size=246534, n_positions=256, n_ctx=256, n_embd=1280, @@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig): attn_pdrop=0.1, layer_norm_epsilon=1e-6, initializer_range=0.02, - - num_labels=1, summary_type='cls_index', summary_use_proj=True, summary_activation=None, @@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig): """Constructs CTRLConfig. Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. + vocab_size: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file. n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). 
dff: Size of the inner dimension of the FFN. @@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig): initializing all weight matrices. """ super(CTRLConfig, self).__init__(**kwargs) - - self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 + self.vocab_size = vocab_size self.n_ctx = n_ctx self.n_positions = n_positions self.n_embd = n_embd @@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig): self.layer_norm_epsilon = layer_norm_epsilon self.initializer_range = initializer_range - self.num_labels = num_labels self.summary_type = summary_type self.summary_use_proj = summary_use_proj self.summary_activation = summary_activation self.summary_first_dropout = summary_first_dropout self.summary_proj_to_labels = summary_proj_to_labels - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif not isinstance(vocab_size_or_config_json_file, int): - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) @property def max_position_embeddings(self): diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py index d5d575be29..d9f7cc6348 100644 --- a/transformers/configuration_distilbert.py +++ b/transformers/configuration_distilbert.py @@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig): pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=30522, + vocab_size=30522, max_position_embeddings=512, sinusoidal_pos_embds=False, n_layers=6, @@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig): seq_classif_dropout=0.2, **kwargs): super(DistilBertConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.sinusoidal_pos_embds = sinusoidal_pos_embds + self.n_layers = n_layers + self.n_heads = n_heads + self.dim = dim + self.hidden_dim = hidden_dim + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation = activation + self.initializer_range = initializer_range + self.tie_weights_ = tie_weights_ + self.qa_dropout = qa_dropout + self.seq_classif_dropout = seq_classif_dropout - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.max_position_embeddings = max_position_embeddings - self.sinusoidal_pos_embds = sinusoidal_pos_embds - self.n_layers = n_layers - self.n_heads = n_heads - self.dim = dim - self.hidden_dim = hidden_dim - self.dropout = dropout - self.attention_dropout = attention_dropout - self.activation = activation - self.initializer_range = initializer_range - self.tie_weights_ = tie_weights_ - self.qa_dropout = qa_dropout - self.seq_classif_dropout = seq_classif_dropout - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model 
config file (str)") @property def hidden_size(self): return self.dim diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py index c2fb4948d3..4c200c0760 100644 --- a/transformers/configuration_gpt2.py +++ b/transformers/configuration_gpt2.py @@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig): """Configuration class to store the configuration of a `GPT2Model`. Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. @@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig): def __init__( self, - vocab_size_or_config_json_file=50257, + vocab_size=50257, n_positions=1024, n_ctx=1024, n_embd=768, @@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig): attn_pdrop=0.1, layer_norm_epsilon=1e-5, initializer_range=0.02, - - num_labels=1, summary_type='cls_index', summary_use_proj=True, summary_activation=None, @@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig): """Constructs GPT2Config. Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. + vocab_size: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file. n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. @@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig): initializing all weight matrices. """ super(GPT2Config, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.n_ctx = n_ctx - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - else: - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) + self.vocab_size = vocab_size + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels @property def max_position_embeddings(self): diff --git 
a/transformers/configuration_openai.py b/transformers/configuration_openai.py index 886b7f5bc5..7776a0bb9f 100644 --- a/transformers/configuration_openai.py +++ b/transformers/configuration_openai.py @@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig): Configuration class to store the configuration of a `OpenAIGPTModel`. Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. + vocab_size: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file. n_positions: Number of positional embeddings. n_ctx: Size of the causal mask (usually same as n_positions). n_embd: Dimensionality of the embeddings and hidden states. @@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig): def __init__( self, - vocab_size_or_config_json_file=40478, + vocab_size=40478, n_positions=512, n_ctx=512, n_embd=768, @@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig): layer_norm_epsilon=1e-5, initializer_range=0.02, predict_special_tokens=True, - - num_labels=1, summary_type='cls_index', summary_use_proj=True, summary_activation=None, @@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig): """Constructs OpenAIGPTConfig. """ super(OpenAIGPTConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.vocab_size = vocab_size_or_config_json_file - self.n_ctx = n_ctx - self.n_positions = n_positions - self.n_embd = n_embd - self.n_layer = n_layer - self.n_head = n_head - self.afn = afn - self.resid_pdrop = resid_pdrop - self.embd_pdrop = embd_pdrop - self.attn_pdrop = attn_pdrop - self.layer_norm_epsilon = layer_norm_epsilon - self.initializer_range = initializer_range - self.predict_special_tokens = predict_special_tokens - - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_first_dropout = summary_first_dropout - self.summary_proj_to_labels = summary_proj_to_labels - else: - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) + self.vocab_size = vocab_size + self.n_ctx = n_ctx + self.n_positions = n_positions + self.n_embd = n_embd + self.n_layer = n_layer + self.n_head = n_head + self.afn = afn + self.resid_pdrop = resid_pdrop + self.embd_pdrop = embd_pdrop + self.attn_pdrop = attn_pdrop + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.predict_special_tokens = predict_special_tokens + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_first_dropout = summary_first_dropout + self.summary_proj_to_labels = summary_proj_to_labels @property def max_position_embeddings(self): diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py index d55a6adbe6..52f0f45a50 100644 --- a/transformers/configuration_transfo_xl.py +++ b/transformers/configuration_transfo_xl.py @@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig): """Configuration class to store the configuration of a 
`TransfoXLModel`. Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. + vocab_size: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file. cutoffs: cutoffs for the adaptive softmax d_model: Dimensionality of the model's hidden states. d_embed: Dimensionality of the embeddings @@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig): pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=267735, + vocab_size=267735, cutoffs=[20000, 40000, 200000], d_model=1024, d_embed=1024, @@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig): """Constructs TransfoXLConfig. """ super(TransfoXLConfig, self).__init__(**kwargs) - self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 + self.vocab_size = vocab_size self.cutoffs = [] self.cutoffs.extend(cutoffs) self.tie_weight = tie_weight @@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig): self.init_std = init_std self.layer_norm_epsilon = layer_norm_epsilon - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif not isinstance(vocab_size_or_config_json_file, int): - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") - @property def max_position_embeddings(self): return self.tgt_len + self.ext_len + self.mem_len @property - def vocab_size(self): - return self.n_token + def n_token(self): # Backward compatibility + return self.vocab_size - @vocab_size.setter - def vocab_size(self, value): - self.n_token = value + @n_token.setter + def n_token(self, value): # Backward compatibility + self.vocab_size = value @property def hidden_size(self): diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 82959adb57..6c9eeea175 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -49,8 +49,7 @@ class PretrainedConfig(object): pretrained_config_archive_map = {} def __init__(self, **kwargs): - self.finetuning_task = kwargs.pop('finetuning_task', None) - self.num_labels = kwargs.pop('num_labels', 2) + # Attributes with defaults self.output_attentions = kwargs.pop('output_attentions', False) self.output_hidden_states = kwargs.pop('output_hidden_states', False) self.output_past = kwargs.pop('output_past', True) # Not used by all models @@ -59,6 +58,22 @@ class PretrainedConfig(object): self.pruned_heads = kwargs.pop('pruned_heads', {}) self.is_decoder = kwargs.pop('is_decoder', False) + # Fine-tuning task arguments + self.finetuning_task = kwargs.pop('finetuning_task', None) + self.num_labels = kwargs.pop('num_labels', 2) + self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) + self.id2label = dict((int(key), value) for key, value in self.id2label.items()) + self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys()))) + self.label2id = dict((key, int(value)) for key, value in self.label2id.items()) + + # Additional attributes without default values + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except 
AttributeError as err: + logger.error("Can't set {} with value {} for {}".format(key, value, self)) + raise err + def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. @@ -183,17 +198,15 @@ class PretrainedConfig(object): @classmethod def from_dict(cls, json_object): """Constructs a `Config` from a Python dictionary of parameters.""" - config = cls(vocab_size_or_config_json_file=-1) - for key, value in json_object.items(): - setattr(config, key, value) - return config + return cls(**json_object) @classmethod def from_json_file(cls, json_file): """Constructs a `Config` from a json file of parameters.""" with open(json_file, "r", encoding='utf-8') as reader: text = reader.read() - return cls.from_dict(json.loads(text)) + dict_obj = json.loads(text) + return cls(**dict_obj) def __eq__(self, other): return self.__dict__ == other.__dict__ diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index fa3a5f40f6..0740cc4026 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig): """Configuration class to store the configuration of a `XLMModel`. Args: - vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`. + vocab_size: Vocabulary size of `inputs_ids` in `XLMModel`. d_model: Size of the encoder layers and the pooler layer. n_layer: Number of hidden layers in the Transformer encoder. n_head: Number of attention heads for each attention layer in @@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig): pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=30145, + vocab_size=30145, emb_dim=2048, n_layers=12, n_heads=16, @@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig): unk_index=3, mask_index=5, is_encoder=True, - - finetuning_task=None, - num_labels=2, summary_type='first', summary_use_proj=True, summary_activation=None, @@ -117,56 +114,43 @@ class XLMConfig(PretrainedConfig): """Constructs XLMConfig. 
""" super(XLMConfig, self).__init__(**kwargs) - - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size_or_config_json_file, int): - self.n_words = vocab_size_or_config_json_file - self.emb_dim = emb_dim - self.n_layers = n_layers - self.n_heads = n_heads - self.dropout = dropout - self.attention_dropout = attention_dropout - self.gelu_activation = gelu_activation - self.sinusoidal_embeddings = sinusoidal_embeddings - self.causal = causal - self.asm = asm - self.n_langs = n_langs - self.use_lang_emb = use_lang_emb - self.layer_norm_eps = layer_norm_eps - self.bos_index = bos_index - self.eos_index = eos_index - self.pad_index = pad_index - self.unk_index = unk_index - self.mask_index = mask_index - self.is_encoder = is_encoder - self.max_position_embeddings = max_position_embeddings - self.embed_init_std = embed_init_std - self.init_std = init_std - self.finetuning_task = finetuning_task - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_proj_to_labels = summary_proj_to_labels - self.summary_first_dropout = summary_first_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") + self.vocab_size = vocab_size + self.emb_dim = emb_dim + self.n_layers = n_layers + self.n_heads = n_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.gelu_activation = gelu_activation + self.sinusoidal_embeddings = sinusoidal_embeddings + self.causal = causal + self.asm = asm + self.n_langs = n_langs + self.use_lang_emb = use_lang_emb + self.layer_norm_eps = layer_norm_eps + self.bos_index = bos_index + self.eos_index = eos_index + self.pad_index = pad_index + self.unk_index = unk_index + self.mask_index = mask_index + self.is_encoder = is_encoder + self.max_position_embeddings = max_position_embeddings + self.embed_init_std = embed_init_std + self.init_std = init_std + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_proj_to_labels = summary_proj_to_labels + self.summary_first_dropout = summary_first_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top @property - def vocab_size(self): - return self.n_words + def n_words(self): # For backward compatibility + return self.vocab_size - @vocab_size.setter - def vocab_size(self, value): - self.n_words = value + @n_words.setter + def n_words(self, value): # For backward compatibility + self.vocab_size = value @property def hidden_size(self): diff --git a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py index 0dbf518849..017c57cfd5 100644 --- a/transformers/configuration_xlnet.py +++ b/transformers/configuration_xlnet.py @@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig): """Configuration class to store the configuration of a ``XLNetModel``. Args: - vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. + vocab_size: Vocabulary size of ``inputs_ids`` in ``XLNetModel``. 
d_model: Size of the encoder layers and the pooler layer. n_layer: Number of hidden layers in the Transformer encoder. n_head: Number of attention heads for each attention layer in @@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig): pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=32000, + vocab_size=32000, d_model=1024, n_layer=24, n_head=16, d_inner=4096, - max_position_embeddings=512, ff_activation="gelu", untie_r=True, attn_type="bi", - initializer_range=0.02, layer_norm_eps=1e-12, - dropout=0.1, mem_len=None, reuse_len=None, bi_data=False, clamp_len=-1, same_length=False, - - finetuning_task=None, - num_labels=2, summary_type='last', summary_use_proj=True, summary_activation='tanh', @@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig): """Constructs XLNetConfig. """ super(XLNetConfig, self).__init__(**kwargs) + self.vocab_size = vocab_size + self.d_model = d_model + self.n_layer = n_layer + self.n_head = n_head + assert d_model % n_head == 0 + self.d_head = d_model // n_head + self.ff_activation = ff_activation + self.d_inner = d_inner + self.untie_r = untie_r + self.attn_type = attn_type - if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2 - and isinstance(vocab_size_or_config_json_file, unicode)): - with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - setattr(config, key, value) - elif isinstance(vocab_size_or_config_json_file, int): - self.n_token = vocab_size_or_config_json_file - self.d_model = d_model - self.n_layer = n_layer - self.n_head = n_head - assert d_model % n_head == 0 - self.d_head = d_model // n_head - self.ff_activation = ff_activation - self.d_inner = d_inner - self.untie_r = untie_r - self.attn_type = attn_type + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps - self.initializer_range = initializer_range - self.layer_norm_eps = layer_norm_eps + self.dropout = dropout + self.mem_len = mem_len + self.reuse_len = reuse_len + self.bi_data = bi_data + self.clamp_len = clamp_len + self.same_length = same_length - self.dropout = dropout - self.mem_len = mem_len - self.reuse_len = reuse_len - self.bi_data = bi_data - self.clamp_len = clamp_len - self.same_length = same_length - - self.finetuning_task = finetuning_task - self.num_labels = num_labels - self.summary_type = summary_type - self.summary_use_proj = summary_use_proj - self.summary_activation = summary_activation - self.summary_last_dropout = summary_last_dropout - self.start_n_top = start_n_top - self.end_n_top = end_n_top - else: - raise ValueError("First argument must be either a vocabulary size (int)" - " or the path to a pretrained model config file (str)") + self.summary_type = summary_type + self.summary_use_proj = summary_use_proj + self.summary_activation = summary_activation + self.summary_last_dropout = summary_last_dropout + self.start_n_top = start_n_top + self.end_n_top = end_n_top @property def max_position_embeddings(self): return -1 @property - def vocab_size(self): - return self.n_token + def n_token(self): # Backward compatibility + return self.vocab_size - @vocab_size.setter - def vocab_size(self, value): - self.n_token = value + @n_token.setter + def n_token(self, value): # Backward compatibility + self.vocab_size = value @property def hidden_size(self): diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py 
b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index 60935add60..b4dc1bb61b 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -46,7 +46,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta.eval() # disable dropout config = BertConfig( - vocab_size_or_config_json_file=50265, + vocab_size=50265, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, num_attention_heads=roberta.args.encoder_attention_heads, diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py index 96fd1c0607..ea660262d7 100644 --- a/transformers/modeling_gpt2.py +++ b/transformers/modeling_gpt2.py @@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel): """ def __init__(self, config): super(GPT2DoubleHeadsModel, self).__init__(config) + config.num_labels = 1 self.transformer = GPT2Model(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py index c738e5e8e3..973473179f 100644 --- a/transformers/modeling_tf_gpt2.py +++ b/transformers/modeling_tf_gpt2.py @@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs) + config.num_labels = 1 self.transformer = TFGPT2MainLayer(config, name='transformer') self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py index fd325e218e..848edfa37a 100644 --- a/transformers/modeling_tf_transfo_xl.py +++ b/transformers/modeling_tf_transfo_xl.py @@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.n_token = config.n_token + self.n_token = config.vocab_size self.d_embed = config.d_embed self.d_model = config.d_model @@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer): self.d_head = config.d_head self.untie_r = config.untie_r - self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, + self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, init_std=config.init_std, name='word_emb') self.drop = tf.keras.layers.Dropout(config.dropout) @@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel): raise NotImplementedError # use adaptive softmax (including standard softmax) else: - self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model, + self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name='crit') def reset_length(self, tgt_len, ext_len, mem_len): diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py index e6a6dfe686..f730af851f 100644 --- a/transformers/modeling_tf_transfo_xl_utilities.py +++ b/transformers/modeling_tf_transfo_xl_utilities.py @@ -25,15 +25,15 @@ import tensorflow as tf from 
.modeling_tf_utils import shape_list class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): - def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, + def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs): super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs) - self.n_token = n_token + self.vocab_size = vocab_size self.d_embed = d_embed self.d_proj = d_proj - self.cutoffs = cutoffs + [n_token] + self.cutoffs = cutoffs + [vocab_size] self.cutoff_ends = [0] + self.cutoffs self.div_val = div_val @@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): self.out_projs.append(weight) else: self.out_projs.append(None) - weight = self.add_weight(shape=(self.n_token, self.d_embed,), + weight = self.add_weight(shape=(self.vocab_size, self.d_embed,), initializer='zeros', trainable=True, name='out_layers_._{}_._weight'.format(i)) - bias = self.add_weight(shape=(self.n_token,), + bias = self.add_weight(shape=(self.vocab_size,), initializer='zeros', trainable=True, name='out_layers_._{}_._bias'.format(i)) @@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer): hidden, target = inputs head_logprob = 0 if self.n_clusters == 0: - softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer()) + softmax_b = tf.get_variable('bias', [self.config.vocab_size], initializer=tf.zeros_initializer()) output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0]) if target is not None: loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output) diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py index 759b57d835..dde2b6a8df 100644 --- a/transformers/modeling_tf_xlnet.py +++ b/transformers/modeling_tf_xlnet.py @@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer): self.use_bfloat16 = config.use_bfloat16 self.initializer_range = config.initializer_range - self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding') + self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding') self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)] self.dropout = tf.keras.layers.Dropout(config.dropout) diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py index a6a82f0dfe..f87d857a7f 100644 --- a/transformers/modeling_transfo_xl.py +++ b/transformers/modeling_transfo_xl.py @@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel): self.output_attentions = config.output_attentions self.output_hidden_states = config.output_hidden_states - self.n_token = config.n_token + self.n_token = config.vocab_size self.d_embed = config.d_embed self.d_model = config.d_model self.n_head = config.n_head self.d_head = config.d_head - self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, + self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val) self.drop = nn.Dropout(config.dropout) @@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel): self.sample_softmax = config.sample_softmax # use sampled softmax if config.sample_softmax > 0: - self.out_layer = nn.Linear(config.d_model, config.n_token) - self.sampler = LogUniformSampler(config.n_token, config.sample_softmax) + self.out_layer = 
nn.Linear(config.d_model, config.vocab_size) + self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax) # use adaptive softmax (including standard softmax) else: - self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model, + self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val) self.init_weights() diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 225e5b059b..daed5f2857 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel): self.clamp_len = config.clamp_len self.n_layer = config.n_layer - self.word_embedding = nn.Embedding(config.n_token, config.d_model) + self.word_embedding = nn.Embedding(config.vocab_size, config.d_model) self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model)) self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)]) self.dropout = nn.Dropout(config.dropout) @@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): self.same_length = config.same_length self.transformer = XLNetModel(config) - self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True) + self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True) self.init_weights() diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py index a14d66ae8f..1911d244e7 100644 --- a/transformers/tests/modeling_albert_test.py +++ b/transformers/tests/modeling_albert_test.py @@ -110,7 +110,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = AlbertConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py index 539f66cd3f..0eb7bc9a14 100644 --- a/transformers/tests/modeling_bert_test.py +++ b/transformers/tests/modeling_bert_test.py @@ -109,7 +109,7 @@ class BertModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = BertConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 80d5d95455..f86eb7a3d0 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -633,7 +633,7 @@ class CommonTestCases: mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length) config = self.config_class( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_positions=self.n_positions, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py index 8c14578a5c..c7de49b2ab 100644 --- a/transformers/tests/modeling_ctrl_test.py +++ b/transformers/tests/modeling_ctrl_test.py @@ -114,7 +114,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = CTRLConfig( - vocab_size_or_config_json_file=self.vocab_size, + 
vocab_size=self.vocab_size, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py index 4b8f64327d..82f71c40da 100644 --- a/transformers/tests/modeling_distilbert_test.py +++ b/transformers/tests/modeling_distilbert_test.py @@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = DistilBertConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, dim=self.hidden_size, n_layers=self.num_hidden_layers, n_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py index ecaa2a4bd0..a82e39c261 100644 --- a/transformers/tests/modeling_gpt2_test.py +++ b/transformers/tests/modeling_gpt2_test.py @@ -110,7 +110,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = GPT2Config( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py index 8e4d13438d..7655e432e8 100644 --- a/transformers/tests/modeling_openai_test.py +++ b/transformers/tests/modeling_openai_test.py @@ -98,7 +98,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = OpenAIGPTConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 7a3553b164..4d34a50528 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = RobertaConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index 7d3325b70b..93aeab66c2 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -118,7 +118,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = AlbertConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index d7a86fecb9..20073e1ab8 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -114,7 +114,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = BertConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, 
hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py index 0b421c20c9..0876582e57 100644 --- a/transformers/tests/modeling_tf_ctrl_test.py +++ b/transformers/tests/modeling_tf_ctrl_test.py @@ -112,7 +112,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = CTRLConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py index 0ec45150ca..d9e971c2a5 100644 --- a/transformers/tests/modeling_tf_distilbert_test.py +++ b/transformers/tests/modeling_tf_distilbert_test.py @@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = DistilBertConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, dim=self.hidden_size, n_layers=self.num_hidden_layers, n_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py index e070b72e65..3f30b32787 100644 --- a/transformers/tests/modeling_tf_gpt2_test.py +++ b/transformers/tests/modeling_tf_gpt2_test.py @@ -115,7 +115,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = GPT2Config( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py index 675e806c12..863dbf1bc0 100644 --- a/transformers/tests/modeling_tf_openai_gpt_test.py +++ b/transformers/tests/modeling_tf_openai_gpt_test.py @@ -114,7 +114,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = OpenAIGPTConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_embd=self.hidden_size, n_layer=self.num_hidden_layers, n_head=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py index 42440bf1b7..f4ed97c44b 100644 --- a/transformers/tests/modeling_tf_roberta_test.py +++ b/transformers/tests/modeling_tf_roberta_test.py @@ -109,7 +109,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): choice_labels = ids_tensor([self.batch_size], self.num_choices) config = RobertaConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, hidden_size=self.hidden_size, num_hidden_layers=self.num_hidden_layers, num_attention_heads=self.num_attention_heads, diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py index 03e332bdc1..553263250a 100644 --- a/transformers/tests/modeling_tf_transfo_xl_test.py +++ b/transformers/tests/modeling_tf_transfo_xl_test.py @@ -92,7 +92,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) config = 
TransfoXLConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, mem_len=self.mem_len, clamp_len=self.clamp_len, cutoffs=self.cutoffs, diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py index a680b70367..228e436149 100644 --- a/transformers/tests/modeling_tf_xlm_test.py +++ b/transformers/tests/modeling_tf_xlm_test.py @@ -125,7 +125,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) config = XLMConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_special=self.n_special, emb_dim=self.hidden_size, n_layers=self.num_hidden_layers, diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py index 94864b86f2..eb66d92793 100644 --- a/transformers/tests/modeling_tf_xlnet_test.py +++ b/transformers/tests/modeling_tf_xlnet_test.py @@ -64,7 +64,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): num_attention_heads=4, d_inner=128, num_hidden_layers=5, - max_position_embeddings=10, type_sequence_label_size=2, untie_r=True, bi_data=False, @@ -88,7 +87,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): self.num_attention_heads = num_attention_heads self.d_inner = d_inner self.num_hidden_layers = num_hidden_layers - self.max_position_embeddings = max_position_embeddings self.bi_data = bi_data self.untie_r = untie_r self.same_length = same_length @@ -122,13 +120,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32) config = XLNetConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, d_model=self.hidden_size, n_head=self.num_attention_heads, d_inner=self.d_inner, n_layer=self.num_hidden_layers, untie_r=self.untie_r, - max_position_embeddings=self.max_position_embeddings, mem_len=self.mem_len, clamp_len=self.clamp_len, same_length=self.same_length, diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py index 647dd3724d..dca46444ba 100644 --- a/transformers/tests/modeling_transfo_xl_test.py +++ b/transformers/tests/modeling_transfo_xl_test.py @@ -91,7 +91,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) config = TransfoXLConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, mem_len=self.mem_len, clamp_len=self.clamp_len, cutoffs=self.cutoffs, diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py index f6b980767c..7cae6c848e 100644 --- a/transformers/tests/modeling_xlm_test.py +++ b/transformers/tests/modeling_xlm_test.py @@ -121,7 +121,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester): is_impossible_labels = ids_tensor([self.batch_size], 2).float() config = XLMConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_special=self.n_special, emb_dim=self.hidden_size, n_layers=self.num_hidden_layers, diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py index 56b6bb3f4d..6d901ee699 100644 --- a/transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -60,7 +60,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): num_attention_heads=4, d_inner=128, 
num_hidden_layers=5, - max_position_embeddings=10, type_sequence_label_size=2, untie_r=True, bi_data=False, @@ -84,7 +83,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): self.num_attention_heads = num_attention_heads self.d_inner = d_inner self.num_hidden_layers = num_hidden_layers - self.max_position_embeddings = max_position_embeddings self.bi_data = bi_data self.untie_r = untie_r self.same_length = same_length @@ -116,13 +114,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester): token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) config = XLNetConfig( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, d_model=self.hidden_size, n_head=self.num_attention_heads, d_inner=self.d_inner, n_layer=self.num_hidden_layers, untie_r=self.untie_r, - max_position_embeddings=self.max_position_embeddings, mem_len=self.mem_len, clamp_len=self.clamp_len, same_length=self.same_length, From 8ade2040984c2cd3fd04bf56b133f70718254b03 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 13 Dec 2019 14:48:47 +0100 Subject: [PATCH 130/302] fix tf --- transformers/modeling_openai.py | 1 + transformers/modeling_tf_openai.py | 1 + 2 files changed, 2 insertions(+) diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py index 4fe7ffee8b..72f1224e39 100644 --- a/transformers/modeling_openai.py +++ b/transformers/modeling_openai.py @@ -590,6 +590,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel): def __init__(self, config): super(OpenAIGPTDoubleHeadsModel, self).__init__(config) + config.num_labels = 1 self.transformer = OpenAIGPTModel(config) self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) self.multiple_choice_head = SequenceSummary(config) diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py index dac3b17590..bd469f0205 100644 --- a/transformers/modeling_tf_openai.py +++ b/transformers/modeling_tf_openai.py @@ -538,6 +538,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel): """ def __init__(self, config, *inputs, **kwargs): super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs) + config.num_labels = 1 self.transformer = TFOpenAIGPTMainLayer(config, name='transformer') self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head') From 5a5c4349e8a141d2c0915d71cb3cee101da0db6f Mon Sep 17 00:00:00 2001 From: Pierric Cistac Date: Fri, 13 Dec 2019 10:02:33 -0500 Subject: [PATCH 131/302] Fix summarization `to_cpu` doc --- examples/summarization/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/summarization/README.md b/examples/summarization/README.md index 96825cfa46..b98581e8e5 100644 --- a/examples/summarization/README.md +++ b/examples/summarization/README.md @@ -29,7 +29,7 @@ And move all the stories to the same folder. We will refer as `$DATA_PATH` the p python run_summarization.py \ --documents_dir $DATA_PATH \ --summaries_output_dir $SUMMARIES_PATH \ # optional - --to_cpu false \ + --no_cuda false \ --batch_size 4 \ --min_length 50 \ --max_length 200 \ @@ -39,7 +39,7 @@ python run_summarization.py \ --compute_rouge true ``` -The scripts executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not suported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. 
The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize). +The script executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize). ## Summarize any text Put the documents that you would like to summarize in a folder (the path to which is referred to as `$DATA_PATH` below) and run the following command: python run_summarization.py \ --documents_dir $DATA_PATH \ --summaries_output_dir $SUMMARIES_PATH \ # optional - --to_cpu false \ + --no_cuda false \ --batch_size 4 \ --min_length 50 \ --max_length 200 \ From 0b51532ce94140cdb22f761b09fff28cce76f985 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 13 Dec 2019 16:22:50 +0100 Subject: [PATCH 132/302] Reintroducing the batch_encode_plus method --- transformers/tokenization_utils.py | 86 ++++++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index f4395cd82c..169caff8dc 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -878,6 +878,92 @@ class PreTrainedTokenizer(object): return_overflowing_tokens=return_overflowing_tokens, return_special_tokens_mask=return_special_tokens_mask) + def batch_encode_plus(self, + batch_text_or_text_pairs=None, + add_special_tokens=False, + max_length=None, + stride=0, + truncation_strategy='longest_first', + return_tensors=None, + return_input_lengths=False, + return_attention_masks=False, + **kwargs): + """ + Returns a dictionary containing the encoded sequence or sequence pair and additional information: + the mask for sequence classification and the overflowing elements if a ``max_length`` is specified. + + Args: + batch_text_or_text_pairs: Batch of sequences or pair of sequences to be encoded. + This can be a list of string/string-sequences/int-sequences or a list of pair of + string/string-sequences/int-sequence (see details in encode_plus) + add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative + to their model. + max_length: if set to a number, will limit the total sequence returned so that it has a maximum length. + If there are overflowing tokens, those will be added to the returned dictionary + stride: if set to a number along with max_length, the overflowing tokens returned will contain some tokens + from the main sequence returned. The value of this argument defines the number of additional tokens. + truncation_strategy: string selected in the following options: + - 'longest_first' (default) Iteratively reduce the inputs sequence until the input is under max_length + starting from the longest one at each token (when there is a pair of input sequences) + - 'only_first': Only truncate the first sequence + - 'only_second': Only truncate the second sequence + - 'do_not_truncate': Does not truncate (raise an error if the input sequence is longer than max_length) + return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant + or PyTorch torch.Tensor instead of a list of python integers.
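+ return_input_lengths: (optional) if set to ``True``, the length of each encoded sequence (before batch padding) is also returned under the ``input_len`` key.
+ return_attention_masks: (optional) if set to ``True``, an ``attention_mask`` distinguishing real tokens from padding is also returned for the batch.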
+ **kwargs: passed to the `self.tokenize()` method + """ + batch_outputs = {} + for ids_or_pair_ids in batch_text_or_text_pairs: + if isinstance(ids_or_pair_ids, (list, tuple)): + assert len(ids_or_pair_ids) == 2 + ids, pair_ids = ids_or_pair_ids + else: + ids, pair_ids = ids_or_pair_ids, None + outputs = self.encode_plus(ids, pair_ids, add_special_tokens=add_special_tokens, max_length=max_length, + stride=stride, truncation_strategy=truncation_strategy, return_tensors=None) + + # Append the non-padded length to the output + if return_input_lengths: + outputs['input_len'] = len(outputs['input_ids']) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + # Compute longest sequence size + max_seq_len = max(map(len, batch_outputs['input_ids'])) + + if return_attention_masks: + # Allow the model to not give any special attention to padded input + batch_outputs['attention_mask'] = [[0] * len(v) for v in batch_outputs['input_ids']] + + if return_tensors is not None: + + # Do the tensor conversion in batch + for key, value in batch_outputs.items(): + + padded_value = value + if key != 'input_len': + # Padding handle + padded_value = [v + [self.pad_token_id if key == 'input_ids' else 1] * (max_seq_len - len(v)) for v in padded_value] + + if return_tensors == 'tf' and is_tf_available(): + batch_outputs[key] = tf.constant(padded_value) + elif return_tensors == 'pt' and is_torch_available(): + batch_outputs[key] = torch.tensor(padded_value) + elif return_tensors is not None: + logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors)) + + # encoder_attention_mask requires 1 for real token, 0 for padding, just invert value + if return_attention_masks: + if is_tf_available(): + batch_outputs['attention_mask'] = tf.abs(batch_outputs['attention_mask'] - 1) + else: + batch_outputs['attention_mask'] = torch.abs(batch_outputs['attention_mask'] - 1) + + return batch_outputs + def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0, truncation_strategy='longest_first', pad_to_max_length=False, From 5c00e344c1350e079d428a4d69cbb465ca7ffde9 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 13 Dec 2019 16:33:29 +0100 Subject: [PATCH 133/302] update model doc - swith 3B/11B to 3b/11b --- docs/source/pretrained_models.rst | 25 ++++++++++--------------- transformers/configuration_t5.py | 4 ++-- transformers/modeling_t5.py | 4 ++-- transformers/modeling_tf_t5.py | 4 ++-- transformers/tokenization_t5.py | 8 ++++---- 5 files changed, 20 insertions(+), 25 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 7e1366b53a..c6b990f213 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -217,25 +217,20 @@ Here is the full list of the currently provided pretrained models together with | | | | ALBERT xxlarge model with no dropout, additional training data and longer training | | | | (see `details `__) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| T5 | ``t5-small`` | | 6-layer, 768-hidden, 12-heads, 66M parameters | -| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint | -| | | (see `details `__) | +| T5 | ``t5-small`` | | ~60M parameters 
with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads, | +| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``t5-base`` | | 6-layer, 768-hidden, 12-heads, 66M parameters | -| | | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer. | -| | | (see `details `__) | +| | ``t5-base`` | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads, | +| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``t5-large`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | -| | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. | -| | | (see `details `__) | +| | ``t5-large`` | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads, | +| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``t5-3b`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | -| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. | -| | | (see `details `__) | +| | ``t5-3B`` | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads, | +| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``t5-11b`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | -| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. 
| -| | | (see `details `__) | +| | ``t5-11B`` | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads, | +| | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 2ccdebc2b1..6391cb4180 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -30,8 +30,8 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = { 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json", 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json", 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json", - 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-config.json", - 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-config.json", + 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json", + 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json", } diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index c9310179a3..263dc33b70 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -44,8 +44,8 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP = { 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin", 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin", 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin", - 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-pytorch_model.bin", - 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-pytorch_model.bin", + 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin", + 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin", } #################################################### diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index 0ae7fff412..1336a1c30d 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -34,8 +34,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = { 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5", 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5", 't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5", - 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-tf_model.h5", - 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-tf_model.h5", + 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5", + 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5", } #################################################### diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py index 62e9c069e2..9fd37b67c0 100644 --- a/transformers/tokenization_t5.py +++ b/transformers/tokenization_t5.py @@ -44,8 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = { 't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", 't5-large': 
"https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", - 't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + 't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", + 't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model", } } @@ -56,8 +56,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 't5-small': 512, 't5-base': 512, 't5-large': 512, - 't5-3B': 512, - 't5-11B': 512, + 't5-3b': 512, + 't5-11b': 512, } class T5Tokenizer(PreTrainedTokenizer): From c8ed1c82c8a42ef700d4129d227fa356385c1d60 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 13 Dec 2019 12:13:48 -0500 Subject: [PATCH 134/302] [SQUAD] Load checkpoint when evaluating without training --- examples/run_squad.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 117b86e32c..a39915ee8b 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -580,10 +580,16 @@ def main(): # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory results = {} if args.do_eval and args.local_rank in [-1, 0]: - checkpoints = [args.output_dir] - if args.eval_all_checkpoints: - checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) - logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs + + if args.do_train: + logger.info("Loading checkpoints saved during training for evaluation") + checkpoints = [args.output_dir] + if args.eval_all_checkpoints: + checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True))) + logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN) # Reduce model loading logs + else: + logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path) + checkpoints = [args.model_name_or_path] logger.info("Evaluate the following checkpoints: %s", checkpoints) From f24a228a9315a4b723509bc9144b53d2bcbc4217 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 13 Dec 2019 14:50:35 -0500 Subject: [PATCH 135/302] Speed up tokenization process --- transformers/data/processors/squad.py | 2 +- transformers/tokenization_utils.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 9bc4375684..e193f6153e 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -116,7 +116,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, unique_id = 1000000000 features = [] - for (example_index, example) in enumerate(tqdm(examples)): + for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")): if is_training and not example.is_impossible: # Get start and end position start_position = example.start_position diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 317ecd167b..e87c87787b 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -637,9 +637,11 @@ class PreTrainedTokenizer(object): text: The sequence to be encoded. 
**kwargs: passed to the child `self.tokenize()` method """ + all_special_tokens = self.all_special_tokens + def lowercase_text(t): # convert non-special tokens to lowercase - escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens] + escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \ r'(.+?)' return re.sub( @@ -680,17 +682,17 @@ class PreTrainedTokenizer(object): tokenized_text = [] for sub_text in text_list: if sub_text not in self.added_tokens_encoder \ - and sub_text not in self.all_special_tokens: + and sub_text not in all_special_tokens: tokenized_text += split_on_token(tok, sub_text) else: tokenized_text += [sub_text] text_list = tokenized_text return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \ - in self.added_tokens_encoder and token not in self.all_special_tokens \ + in self.added_tokens_encoder and token not in all_special_tokens \ else [token] for token in tokenized_text))) - added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens + added_tokens = list(self.added_tokens_encoder.keys()) + all_special_tokens tokenized_text = split_on_tokens(added_tokens, text) return tokenized_text From d46147294852694d1dc701c72b9053ff2e726265 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 13 Dec 2019 15:31:52 -0500 Subject: [PATCH 136/302] return for SQuAD [BLACKED] --- transformers/data/processors/glue.py | 2 +- transformers/data/processors/squad.py | 280 ++++++++++++++++---------- 2 files changed, 172 insertions(+), 110 deletions(-) diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py index 518251b050..11ebd949de 100644 --- a/transformers/data/processors/glue.py +++ b/transformers/data/processors/glue.py @@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer, if is_tf_available() and is_tf_dataset: def gen(): for ex in features: - yield ({'input_ids': ex.input_ids, + yield ({'input_ids': ex.input_ids, 'attention_mask': ex.attention_mask, 'token_type_ids': ex.token_type_ids}, ex.label) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index e193f6153e..84aa429e26 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -18,19 +18,20 @@ if is_tf_available(): logger = logging.getLogger(__name__) -def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, - orig_answer_text): + +def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text): """Returns tokenized answer spans that better match the annotated answer.""" tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text)) for new_start in range(input_start, input_end + 1): for new_end in range(input_end, new_start - 1, -1): - text_span = " ".join(doc_tokens[new_start:(new_end + 1)]) + text_span = " ".join(doc_tokens[new_start : (new_end + 1)]) if text_span == tok_answer_text: return (new_start, new_end) return (input_start, input_end) + def _check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" best_score = None @@ -50,10 +51,11 @@ def _check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index + def _new_check_is_max_context(doc_spans, cur_span_index, position): """Check if this is the 'max context' doc span for the token.""" # if len(doc_spans) == 1: - # return True + # return True 
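# A token can appear in several overlapping doc spans; the span in which it has the most
# surrounding context, i.e. the one maximizing
# min(left context, right context) + 0.01 * span length, is treated as its "max context" span.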
best_score = None best_span_index = None for (span_index, doc_span) in enumerate(doc_spans): @@ -71,14 +73,16 @@ def _new_check_is_max_context(doc_spans, cur_span_index, position): return cur_span_index == best_span_index + def _is_whitespace(c): if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F: return True return False -def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, - doc_stride, max_query_length, is_training, - return_dataset=False): + +def squad_convert_examples_to_features( + examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False +): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. @@ -112,7 +116,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, ) """ - # Defining helper methods + # Defining helper methods unique_id = 1000000000 features = [] @@ -123,13 +127,12 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, end_position = example.end_position # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) + actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)]) cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) if actual_text.find(cleaned_answer_text) == -1: logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text) continue - tok_to_orig_index = [] orig_to_tok_index = [] all_doc_tokens = [] @@ -140,7 +143,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, tok_to_orig_index.append(i) all_doc_tokens.append(sub_token) - if is_training and not example.is_impossible: tok_start_position = orig_to_tok_index[example.start_position] if example.end_position < len(example.doc_tokens) - 1: @@ -153,36 +155,41 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, ) spans = [] - - truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) - sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence - sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair + + truncated_query = tokenizer.encode( + example.question_text, add_special_tokens=False, max_length=max_query_length + ) + sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair span_doc_tokens = all_doc_tokens while len(spans) * doc_stride < len(all_doc_tokens): - + encoded_dict = tokenizer.encode_plus( - truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, - span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, - max_length=max_seq_length, - return_overflowing_tokens=True, + truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, + span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, + max_length=max_seq_length, + return_overflowing_tokens=True, pad_to_max_length=True, stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' + truncation_strategy="only_second" if tokenizer.padding_side == "right" else 
"only_first", ) - paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + paragraph_len = min( + len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens, + ) - if tokenizer.pad_token_id in encoded_dict['input_ids']: - non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + if tokenizer.pad_token_id in encoded_dict["input_ids"]: + non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)] else: - non_padded_ids = encoded_dict['input_ids'] + non_padded_ids = encoded_dict["input_ids"] tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) token_to_orig_map = {} for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] encoded_dict["paragraph_len"] = paragraph_len @@ -202,16 +209,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, for doc_span_index in range(len(spans)): for j in range(spans[doc_span_index]["paragraph_len"]): is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + index = ( + j + if tokenizer.padding_side == "left" + else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j + ) spans[doc_span_index]["token_is_max_context"][index] = is_max_context for span in spans: # Identify the position of the CLS token - cls_index = span['input_ids'].index(tokenizer.cls_token_id) + cls_index = span["input_ids"].index(tokenizer.cls_token_id) # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) # Original TF implem also keep the classification token (set to 0) (not sure why...) 
- p_mask = np.array(span['token_type_ids']) + p_mask = np.array(span["token_type_ids"]) p_mask = np.minimum(p_mask, 1) @@ -224,7 +235,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, # Set the CLS index to '0' p_mask[cls_index] = 0 - span_is_impossible = example.is_impossible start_position = 0 end_position = 0 @@ -247,55 +257,99 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_offset = 0 else: doc_offset = len(truncated_query) + sequence_added_tokens - + start_position = tok_start_position - doc_start + doc_offset end_position = tok_end_position - doc_start + doc_offset - - features.append(SquadFeatures( - span['input_ids'], - span['attention_mask'], - span['token_type_ids'], - cls_index, - p_mask.tolist(), - - example_index=example_index, - unique_id=unique_id, - paragraph_len=span['paragraph_len'], - token_is_max_context=span["token_is_max_context"], - tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"], - - start_position=start_position, - end_position=end_position - )) + features.append( + SquadFeatures( + span["input_ids"], + span["attention_mask"], + span["token_type_ids"], + cls_index, + p_mask.tolist(), + example_index=example_index, + unique_id=unique_id, + paragraph_len=span["paragraph_len"], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + start_position=start_position, + end_position=end_position, + ) + ) unique_id += 1 - if return_dataset == 'pt': + if return_dataset == "pt": if not is_torch_available(): raise ImportError("Pytorch must be installed to return a pytorch dataset.") # Convert to Tensors and build dataset all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) + all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long) + all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long) all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) if not is_training: all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask + ) else: all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask) + dataset = TensorDataset( + all_input_ids, + all_attention_masks, + all_token_type_ids, + all_start_positions, + all_end_positions, + all_cls_index, + all_p_mask, + ) return features, dataset - + elif return_dataset == "tf": + if not is_tf_available(): + raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.") + + def gen(): + for ex in features: + yield ( + { + "input_ids": ex.input_ids, + "attention_mask": ex.attention_mask, + "token_type_ids": ex.token_type_ids, + }, { + "start_position": ex.start_position, + "end_position": 
ex.end_position, + "cls_index": ex.cls_index, + "p_mask": ex.p_mask, + } + ) + + return tf.data.Dataset.from_generator( + gen, + ( + {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, + {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32}, + ), + ( + { + "input_ids": tf.TensorShape([None]), + "attention_mask": tf.TensorShape([None]), + "token_type_ids": tf.TensorShape([None]), + }, + { + "start_position": tf.TensorShape([]), + "end_position": tf.TensorShape([]), + "cls_index": tf.TensorShape([]), + "p_mask": tf.TensorShape([None]), + }, + ), + ) return features @@ -305,31 +359,32 @@ class SquadProcessor(DataProcessor): Processor for the SQuAD data set. Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively. """ + train_file = None dev_file = None def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False): if not evaluate: - answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8') - answer_start = tensor_dict['answers']['answer_start'][0].numpy() + answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8") + answer_start = tensor_dict["answers"]["answer_start"][0].numpy() answers = [] else: - answers = [{ - "answer_start": start.numpy(), - "text": text.numpy().decode('utf-8') - } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])] + answers = [ + {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")} + for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"]) + ] answer = None answer_start = None return SquadExample( - qas_id=tensor_dict['id'].numpy().decode("utf-8"), - question_text=tensor_dict['question'].numpy().decode('utf-8'), - context_text=tensor_dict['context'].numpy().decode('utf-8'), + qas_id=tensor_dict["id"].numpy().decode("utf-8"), + question_text=tensor_dict["question"].numpy().decode("utf-8"), + context_text=tensor_dict["context"].numpy().decode("utf-8"), answer_text=answer, start_position_character=answer_start, - title=tensor_dict['title'].numpy().decode('utf-8'), - answers=answers + title=tensor_dict["title"].numpy().decode("utf-8"), + answers=answers, ) def get_examples_from_dataset(self, dataset, evaluate=False): @@ -359,7 +414,7 @@ class SquadProcessor(DataProcessor): examples = [] for tensor_dict in tqdm(dataset): - examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) + examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) return examples @@ -379,7 +434,9 @@ class SquadProcessor(DataProcessor): if self.train_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader: + with open( + os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8" + ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "train") @@ -397,8 +454,10 @@ class SquadProcessor(DataProcessor): if self.dev_file is None: raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor") - - with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader: + + with open( + os.path.join(data_dir, self.dev_file if filename is None else filename), "r", 
encoding="utf-8" + ) as reader: input_data = json.load(reader)["data"] return self._create_examples(input_data, "dev") @@ -406,7 +465,7 @@ class SquadProcessor(DataProcessor): is_training = set_type == "train" examples = [] for entry in tqdm(input_data): - title = entry['title'] + title = entry["title"] for paragraph in entry["paragraphs"]: context_text = paragraph["context"] for qa in paragraph["qas"]: @@ -415,7 +474,7 @@ class SquadProcessor(DataProcessor): start_position_character = None answer_text = None answers = [] - + if "is_impossible" in qa: is_impossible = qa["is_impossible"] else: @@ -424,8 +483,8 @@ class SquadProcessor(DataProcessor): if not is_impossible: if is_training: answer = qa["answers"][0] - answer_text = answer['text'] - start_position_character = answer['answer_start'] + answer_text = answer["text"] + start_position_character = answer["answer_start"] else: answers = qa["answers"] @@ -437,12 +496,13 @@ class SquadProcessor(DataProcessor): start_position_character=start_position_character, title=title, is_impossible=is_impossible, - answers=answers + answers=answers, ) examples.append(example) return examples + class SquadV1Processor(SquadProcessor): train_file = "train-v1.1.json" dev_file = "dev-v1.1.json" @@ -451,7 +511,7 @@ class SquadV1Processor(SquadProcessor): class SquadV2Processor(SquadProcessor): train_file = "train-v2.0.json" dev_file = "dev-v2.0.json" - + class SquadExample(object): """ @@ -468,21 +528,23 @@ class SquadExample(object): is_impossible: False by default, set to True if the example has no possible answer. """ - def __init__(self, - qas_id, - question_text, - context_text, - answer_text, - start_position_character, - title, - answers=[], - is_impossible=False): + def __init__( + self, + qas_id, + question_text, + context_text, + answer_text, + start_position_character, + title, + answers=[], + is_impossible=False, + ): self.qas_id = qas_id self.question_text = question_text self.context_text = context_text self.answer_text = answer_text self.title = title - self.is_impossible = is_impossible + self.is_impossible = is_impossible self.answers = answers self.start_position, self.end_position = 0, 0 @@ -537,24 +599,23 @@ class SquadFeatures(object): end_position: end of the answer token index """ - def __init__(self, - input_ids, - attention_mask, - token_type_ids, - cls_index, - p_mask, - - example_index, - unique_id, - paragraph_len, - token_is_max_context, - tokens, - token_to_orig_map, - - start_position, - end_position - ): - self.input_ids = input_ids + def __init__( + self, + input_ids, + attention_mask, + token_type_ids, + cls_index, + p_mask, + example_index, + unique_id, + paragraph_len, + token_is_max_context, + tokens, + token_to_orig_map, + start_position, + end_position, + ): + self.input_ids = input_ids self.attention_mask = attention_mask self.token_type_ids = token_type_ids self.cls_index = cls_index @@ -580,12 +641,13 @@ class SquadResult(object): start_logits: The logits corresponding to the start of the answer end_logits: The logits corresponding to the end of the answer """ + def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None): self.start_logits = start_logits self.end_logits = end_logits self.unique_id = unique_id - + if start_top_index: self.start_top_index = start_top_index self.end_top_index = end_top_index - self.cls_logits = cls_logits \ No newline at end of file + self.cls_logits = cls_logits From 866d73ca26a13d7e378b2f88f365cb0807c47805 Mon Sep 17 00:00:00 
2001 From: Julien Chaumond Date: Fri, 13 Dec 2019 16:09:23 -0500 Subject: [PATCH 137/302] [cli] Upload is now compatible with folders --- transformers/commands/user.py | 57 ++++++++++++++++++++++++++--------- 1 file changed, 43 insertions(+), 14 deletions(-) diff --git a/transformers/commands/user.py b/transformers/commands/user.py index d79922ed8a..8e0e563422 100644 --- a/transformers/commands/user.py +++ b/transformers/commands/user.py @@ -19,8 +19,8 @@ class UserCommands(BaseTransformersCLICommand): list_parser.set_defaults(func=lambda args: ListObjsCommand(args)) # upload upload_parser = parser.add_parser('upload') - upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.') - upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.') + upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.') + upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.') upload_parser.set_defaults(func=lambda args: UploadCommand(args)) @@ -138,28 +138,57 @@ class ListObjsCommand(BaseUserCommand): class UploadCommand(BaseUserCommand): + def walk_dir(self, rel_path): + """ + Recursively list all files in a folder. + """ + entries: List[os.DirEntry] = list(os.scandir(rel_path)) + files = [ + ( + os.path.join(os.getcwd(), f.path), # filepath + f.path # filename + ) + for f in entries if f.is_file() + ] + for f in entries: + if f.is_dir(): + files += self.walk_dir(f.path) + return files + def run(self): token = HfFolder.get_token() if token is None: print("Not logged in") exit(1) - filepath = os.path.join(os.getcwd(), self.args.file) - filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath) - print( - "About to upload file {} to S3 under filename {}".format( - ANSI.bold(filepath), ANSI.bold(filename) + local_path = os.path.abspath(self.args.path) + if os.path.isdir(local_path): + if self.args.filename is not None: + raise ValueError("Cannot specify a filename override when uploading a folder.") + rel_path = os.path.basename(local_path) + files = self.walk_dir(rel_path) + elif os.path.isfile(local_path): + filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path) + files = [(local_path, filename)] + else: + raise ValueError("Not a valid file or directory: {}".format(local_path)) + + for filepath, filename in files: + print( + "About to upload file {} to S3 under filename {}".format( + ANSI.bold(filepath), ANSI.bold(filename) + ) ) - ) choice = input("Proceed? [Y/n] ").lower() if not(choice == "" or choice == "y" or choice == "yes"): print("Abort") exit() print( - ANSI.bold("Uploading... This might take a while if file is large") + ANSI.bold("Uploading... 
This might take a while if files are large") ) - access_url = self._api.presign_and_upload( - token=token, filename=filename, filepath=filepath - ) - print("Your file now lives at:") - print(access_url) + for filepath, filename in files: + access_url = self._api.presign_and_upload( + token=token, filename=filename, filepath=filepath + ) + print("Your file now lives at:") + print(access_url) From 5b7b78e088352a3aaf1f80d26bb1cd466bc2ac64 Mon Sep 17 00:00:00 2001 From: Pascal Voitot Date: Sun, 8 Dec 2019 23:22:02 +0100 Subject: [PATCH 138/302] :bug: #2096 in tokenizer.decode, adds a space after special tokens to return right formatted string --- transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index e87c87787b..42519c26ba 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -1180,7 +1180,7 @@ class PreTrainedTokenizer(object): if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) current_sub_text = [] - sub_texts.append(" " + token) + sub_texts.append(" " + token + " ") else: current_sub_text.append(token) if current_sub_text: From df160af736cba1d50c09abcf92c8fc6c00bcdb13 Mon Sep 17 00:00:00 2001 From: Pascal Voitot Date: Tue, 10 Dec 2019 00:03:38 +0100 Subject: [PATCH 139/302] :bug: #2096 in tokenizer.decode, space is not joined between all subtexts instead of before added tokens --- transformers/tests/tokenization_bert_test.py | 16 ++++++++++++++++ transformers/tokenization_utils.py | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index f390248956..c47f149e9a 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -99,6 +99,21 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): self.assertListEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) + def test_encode_decode_with_spaces(self): + tokenizer = self.get_tokenizer() + + new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] + tokenizer.add_tokens(new_toks) + input = "unwanted running [ABC] [DEF] running unwanted [ABC] GHI IHG unwanted [DEF]" + encoded = tokenizer.encode(input) + decoded = tokenizer.decode(encoded) + self.assertEqual( + decoded.lower(), + (f"[CLS] {input.lower()} [SEP]").lower() + ) + + + def test_is_whitespace(self): self.assertTrue(_is_whitespace(u" ")) self.assertTrue(_is_whitespace(u"\t")) @@ -139,5 +154,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): assert encoded_sentence == [101] + text + [102] assert encoded_pair == [101] + text + [102] + text_2 + [102] + if __name__ == '__main__': unittest.main() diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 42519c26ba..8aef80fec8 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -1180,12 +1180,12 @@ class PreTrainedTokenizer(object): if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) current_sub_text = [] - sub_texts.append(" " + token + " ") + sub_texts.append(token) else: current_sub_text.append(token) if current_sub_text: sub_texts.append(self.convert_tokens_to_string(current_sub_text)) - text = ''.join(sub_texts) + text = ' '.join(sub_texts) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) From 
dd2add9f6efdaa248f3074b865dc67c439b30a4d Mon Sep 17 00:00:00 2001 From: Pascal Voitot Date: Tue, 10 Dec 2019 00:29:44 +0100 Subject: [PATCH 140/302] more tests --- transformers/tests/tokenization_bert_test.py | 2 +- transformers/tests/tokenization_gpt2_test.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index c47f149e9a..b93934dd67 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -109,7 +109,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): decoded = tokenizer.decode(encoded) self.assertEqual( decoded.lower(), - (f"[CLS] {input.lower()} [SEP]").lower() + (f"[CLS] {input} [SEP]").lower() ) diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py index a77cc75ec2..9e6ca3c4fd 100644 --- a/transformers/tests/tokenization_gpt2_test.py +++ b/transformers/tests/tokenization_gpt2_test.py @@ -67,6 +67,20 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) + def test_encode_decode_with_spaces(self): + tokenizer = self.get_tokenizer() + + new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] + tokenizer.add_tokens(new_toks) + input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower[DEF]" + encoded = tokenizer.encode(input) + decoded = tokenizer.decode(encoded) + self.assertEqual( + decoded.lower(), + input.lower() + ) + + if __name__ == '__main__': unittest.main() From 4cbdc7d910a0a12871a8e29760a3a6721a138421 Mon Sep 17 00:00:00 2001 From: Pascal Voitot Date: Tue, 10 Dec 2019 09:37:15 +0100 Subject: [PATCH 141/302] missed space --- transformers/tests/tokenization_bert_test.py | 2 -- transformers/tests/tokenization_gpt2_test.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index b93934dd67..a039a24dd8 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -112,8 +112,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): (f"[CLS] {input} [SEP]").lower() ) - - def test_is_whitespace(self): self.assertTrue(_is_whitespace(u" ")) self.assertTrue(_is_whitespace(u"\t")) diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py index 9e6ca3c4fd..1b4fe42874 100644 --- a/transformers/tests/tokenization_gpt2_test.py +++ b/transformers/tests/tokenization_gpt2_test.py @@ -72,7 +72,7 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] tokenizer.add_tokens(new_toks) - input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower[DEF]" + input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower [DEF]" encoded = tokenizer.encode(input) decoded = tokenizer.decode(encoded) self.assertEqual( From f2ac50cb5560e13d941f1ea3dec3399f12f7a3fb Mon Sep 17 00:00:00 2001 From: Pascal Voitot Date: Tue, 10 Dec 2019 09:58:06 +0100 Subject: [PATCH 142/302] better for python2.x --- transformers/tests/tokenization_bert_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index a039a24dd8..77b124cdf2 100644 --- 
a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -109,7 +109,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): decoded = tokenizer.decode(encoded) self.assertEqual( decoded.lower(), - (f"[CLS] {input} [SEP]").lower() + ("[CLS] " + input + " [SEP]").lower() ) def test_is_whitespace(self): From c3248cf122014dce10c0c8d0e663a95c948493e3 Mon Sep 17 00:00:00 2001 From: LysandreJik Date: Wed, 11 Dec 2019 12:36:37 -0500 Subject: [PATCH 143/302] Tests for all tokenizers --- transformers/tests/tokenization_bert_test.py | 13 ------------- transformers/tests/tokenization_gpt2_test.py | 15 --------------- transformers/tests/tokenization_tests_commons.py | 9 +++++++++ 3 files changed, 9 insertions(+), 28 deletions(-) diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py index 77b124cdf2..c503ea5e1e 100644 --- a/transformers/tests/tokenization_bert_test.py +++ b/transformers/tests/tokenization_bert_test.py @@ -99,19 +99,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester): self.assertListEqual( tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"]) - def test_encode_decode_with_spaces(self): - tokenizer = self.get_tokenizer() - - new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] - tokenizer.add_tokens(new_toks) - input = "unwanted running [ABC] [DEF] running unwanted [ABC] GHI IHG unwanted [DEF]" - encoded = tokenizer.encode(input) - decoded = tokenizer.decode(encoded) - self.assertEqual( - decoded.lower(), - ("[CLS] " + input + " [SEP]").lower() - ) - def test_is_whitespace(self): self.assertTrue(_is_whitespace(u" ")) self.assertTrue(_is_whitespace(u"\t")) diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py index 1b4fe42874..5eae767bdf 100644 --- a/transformers/tests/tokenization_gpt2_test.py +++ b/transformers/tests/tokenization_gpt2_test.py @@ -67,20 +67,5 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester): self.assertListEqual( tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) - def test_encode_decode_with_spaces(self): - tokenizer = self.get_tokenizer() - - new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] - tokenizer.add_tokens(new_toks) - input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower [DEF]" - encoded = tokenizer.encode(input) - decoded = tokenizer.decode(encoded) - self.assertEqual( - decoded.lower(), - input.lower() - ) - - - if __name__ == '__main__': unittest.main() diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index c009958135..13e7ae746a 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -232,6 +232,15 @@ class CommonTestCases: self.assertNotEqual(len(tokens_2), 0) self.assertIsInstance(text_2, (str, unicode)) + def test_encode_decode_with_spaces(self): + tokenizer = self.get_tokenizer() + + new_toks = ['[ABC]', '[DEF]', 'GHI IHG'] + tokenizer.add_tokens(new_toks) + input = "[ABC] [DEF] [ABC] GHI IHG [DEF]" + encoded = tokenizer.encode(input, add_special_tokens=False) + decoded = tokenizer.decode(encoded) + self.assertEqual(decoded, input) def test_pretrained_model_lists(self): weights_list = list(self.tokenizer_class.max_model_input_sizes.keys()) From 7bd11dda6f43656cf0a3891b7f61a67196d233b4 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 13 Dec 2019 16:45:30 -0500 Subject: [PATCH 144/302] Release: v2.2.2 --- 
README.md | 2 +- docs/source/conf.py | 2 +- setup.py | 2 +- transformers/__init__.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index f3aa8a95ee..f24ceaa6d2 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ Choose the right framework for every part of a model's lifetime | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers | | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers | -| [Documentation][(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more | +| [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more | ## Installation diff --git a/docs/source/conf.py b/docs/source/conf.py index 2f8505ab3a..99b7b44922 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ author = u'huggingface' # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'2.2.1' +release = u'2.2.2' # -- General configuration --------------------------------------------------- diff --git a/setup.py b/setup.py index c4af32df83..eacb5ecec0 100644 --- a/setup.py +++ b/setup.py @@ -44,7 +44,7 @@ extras['all'] = [package for package in extras.values()] setup( name="transformers", - version="2.2.1", + version="2.2.2", author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", author_email="thomas@huggingface.co", description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", diff --git a/transformers/__init__.py b/transformers/__init__.py index 5d7b0b772c..c11919f0a7 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.2.1" +__version__ = "2.2.2" # Work around to update TensorFlow's absl.logging threshold which alters the # default Python logging output behavior when present. 
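For reference, the round-trip behavior that the new test_encode_decode_with_spaces common test locks in can be sketched outside the test suite roughly as follows. This is an illustrative sketch only: it assumes the bert-base-uncased vocabulary can be downloaded, whereas the common test itself runs each tokenizer class against its own small fixture vocabulary.

    from transformers import BertTokenizer

    # Added tokens are split out before sub-word tokenization and, after the
    # decode fix above, the decoded sub-texts are re-joined with single spaces,
    # so the input survives an encode/decode round trip.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    tokenizer.add_tokens(["[ABC]", "[DEF]", "GHI IHG"])

    text = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
    ids = tokenizer.encode(text, add_special_tokens=False)
    assert tokenizer.decode(ids) == text
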
From 8e9526b4b56486606979f1c47d3317b0b22340fe Mon Sep 17 00:00:00 2001 From: erenup Date: Sat, 14 Dec 2019 08:43:58 +0800 Subject: [PATCH 145/302] add multiple processing --- examples/run_squad.py | 5 +- transformers/data/processors/squad.py | 342 ++++++++++++++------------ 2 files changed, 187 insertions(+), 160 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index d124d07eb5..b8883b8852 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -360,7 +360,8 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal doc_stride=args.doc_stride, max_query_length=args.max_query_length, is_training=not evaluate, - return_dataset='pt' + return_dataset='pt', + threads=args.threads, ) if args.local_rank in [-1, 0]: @@ -478,6 +479,8 @@ def main(): "See details at https://nvidia.github.io/apex/amp.html") parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") + + parser.add_argument('--threads', type=int, default=1, help='multiple threads for converting example to features') args = parser.parse_args() if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir: diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 3f5fd46382..d24775996e 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -4,6 +4,9 @@ import logging import os import json import numpy as np +from multiprocessing import Pool +from multiprocessing import cpu_count +from functools import partial from ...tokenization_bert import BasicTokenizer, whitespace_tokenize from .utils import DataProcessor, InputExample, InputFeatures @@ -76,9 +79,168 @@ def _is_whitespace(c): return True return False +def squad_convert_example_to_features(example, max_seq_length, + doc_stride, max_query_length, is_training): + features = [] + if is_training and not example.is_impossible: + # Get start and end position + start_position = example.start_position + end_position = example.end_position + + # If the answer cannot be found in the text, then skip this example. + actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) + cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) + if actual_text.find(cleaned_answer_text) == -1: + logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) + return [] + + tok_to_orig_index = [] + orig_to_tok_index = [] + all_doc_tokens = [] + for (i, token) in enumerate(example.doc_tokens): + orig_to_tok_index.append(len(all_doc_tokens)) + sub_tokens = tokenizer.tokenize(token) + for sub_token in sub_tokens: + tok_to_orig_index.append(i) + all_doc_tokens.append(sub_token) + + if is_training and not example.is_impossible: + tok_start_position = orig_to_tok_index[example.start_position] + if example.end_position < len(example.doc_tokens) - 1: + tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 + else: + tok_end_position = len(all_doc_tokens) - 1 + + (tok_start_position, tok_end_position) = _improve_answer_span( + all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text + ) + + spans = [] + + truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) + sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ + if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence + sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair + + span_doc_tokens = all_doc_tokens + while len(spans) * doc_stride < len(all_doc_tokens): + + encoded_dict = tokenizer.encode_plus( + truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, + span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, + max_length=max_seq_length, + return_overflowing_tokens=True, + pad_to_max_length=True, + stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, + truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' + ) + + paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, + max_seq_length - len(truncated_query) - sequence_pair_added_tokens) + + if tokenizer.pad_token_id in encoded_dict['input_ids']: + non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] + else: + non_padded_ids = encoded_dict['input_ids'] + + tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) + + token_to_orig_map = {} + for i in range(paragraph_len): + index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i + token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] + + encoded_dict["paragraph_len"] = paragraph_len + encoded_dict["tokens"] = tokens + encoded_dict["token_to_orig_map"] = token_to_orig_map + encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens + encoded_dict["token_is_max_context"] = {} + encoded_dict["start"] = len(spans) * doc_stride + encoded_dict["length"] = paragraph_len + + spans.append(encoded_dict) + + if "overflowing_tokens" not in encoded_dict: + break + span_doc_tokens = encoded_dict["overflowing_tokens"] + + for doc_span_index in range(len(spans)): + for j in range(spans[doc_span_index]["paragraph_len"]): + is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) + index = j if tokenizer.padding_side == "left" else spans[doc_span_index][ + "truncated_query_with_special_tokens_length"] + j + spans[doc_span_index]["token_is_max_context"][index] = is_max_context + + for span in spans: + # Identify the position of the CLS token + cls_index = span['input_ids'].index(tokenizer.cls_token_id) + + # p_mask: mask with 1 for token than cannot be in the 
answer (0 for token which can be in an answer) + # Original TF implem also keep the classification token (set to 0) (not sure why...) + p_mask = np.array(span['token_type_ids']) + + p_mask = np.minimum(p_mask, 1) + + if tokenizer.padding_side == "right": + # Limit positive values to one + p_mask = 1 - p_mask + + p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 + + # Set the CLS index to '0' + p_mask[cls_index] = 0 + + span_is_impossible = example.is_impossible + start_position = 0 + end_position = 0 + if is_training and not span_is_impossible: + # For training, if our document chunk does not contain an annotation + # we throw it out, since there is nothing to predict. + doc_start = span["start"] + doc_end = span["start"] + span["length"] - 1 + out_of_span = False + + if not (tok_start_position >= doc_start and tok_end_position <= doc_end): + out_of_span = True + + if out_of_span: + start_position = cls_index + end_position = cls_index + span_is_impossible = True + else: + if tokenizer.padding_side == "left": + doc_offset = 0 + else: + doc_offset = len(truncated_query) + sequence_added_tokens + + start_position = tok_start_position - doc_start + doc_offset + end_position = tok_end_position - doc_start + doc_offset + + features.append(SquadFeatures( + span['input_ids'], + span['attention_mask'], + span['token_type_ids'], + cls_index, + p_mask.tolist(), + example_index=0, + unique_id=0, + paragraph_len=span['paragraph_len'], + token_is_max_context=span["token_is_max_context"], + tokens=span["tokens"], + token_to_orig_map=span["token_to_orig_map"], + + start_position=start_position, + end_position=end_position + )) + return features + +def squad_convert_example_to_features_init(tokenizer_for_convert): + global tokenizer + tokenizer = tokenizer_for_convert + def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, - return_dataset=False): + return_dataset=False, threads=1): """ Converts a list of examples into a list of features that can be directly given as input to a model. It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs. @@ -93,6 +255,8 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, return_dataset: Default False. Either 'pt' or 'tf'. if 'pt': returns a torch.data.TensorDataset, if 'tf': returns a tf.data.Dataset + threads: multiple processing threadsa-smi + Returns: list of :class:`~transformers.data.processors.squad.SquadFeatures` @@ -113,165 +277,26 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, """ # Defining helper methods - unique_id = 1000000000 - features = [] - for (example_index, example) in enumerate(tqdm(examples)): - if is_training and not example.is_impossible: - # Get start and end position - start_position = example.start_position - end_position = example.end_position - - # If the answer cannot be found in the text, then skip this example. - actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)]) - cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text)) - if actual_text.find(cleaned_answer_text) == -1: - logger.warning("Could not find answer: '%s' vs. 
'%s'", actual_text, cleaned_answer_text) - continue - - - tok_to_orig_index = [] - orig_to_tok_index = [] - all_doc_tokens = [] - for (i, token) in enumerate(example.doc_tokens): - orig_to_tok_index.append(len(all_doc_tokens)) - sub_tokens = tokenizer.tokenize(token) - for sub_token in sub_tokens: - tok_to_orig_index.append(i) - all_doc_tokens.append(sub_token) - - if is_training and not example.is_impossible: - tok_start_position = orig_to_tok_index[example.start_position] - if example.end_position < len(example.doc_tokens) - 1: - tok_end_position = orig_to_tok_index[example.end_position + 1] - 1 - else: - tok_end_position = len(all_doc_tokens) - 1 - - (tok_start_position, tok_end_position) = _improve_answer_span( - all_doc_tokens, tok_start_position, tok_end_position, tokenizer, example.answer_text - ) - - spans = [] - - truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length) - sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence + 1 \ - if 'roberta' in str(type(tokenizer)) else tokenizer.max_len - tokenizer.max_len_single_sentence - sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair - - span_doc_tokens = all_doc_tokens - while len(spans) * doc_stride < len(all_doc_tokens): - - encoded_dict = tokenizer.encode_plus( - truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, - span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, - max_length=max_seq_length, - return_overflowing_tokens=True, - pad_to_max_length=True, - stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens, - truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first' - ) - - paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens) - - if tokenizer.pad_token_id in encoded_dict['input_ids']: - non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)] - else: - non_padded_ids = encoded_dict['input_ids'] - - tokens = tokenizer.convert_ids_to_tokens(non_padded_ids) - - token_to_orig_map = {} - for i in range(paragraph_len): - index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i - token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i] - - encoded_dict["paragraph_len"] = paragraph_len - encoded_dict["tokens"] = tokens - encoded_dict["token_to_orig_map"] = token_to_orig_map - encoded_dict["truncated_query_with_special_tokens_length"] = len(truncated_query) + sequence_added_tokens - encoded_dict["token_is_max_context"] = {} - encoded_dict["start"] = len(spans) * doc_stride - encoded_dict["length"] = paragraph_len - - spans.append(encoded_dict) - - if "overflowing_tokens" not in encoded_dict: - break - span_doc_tokens = encoded_dict["overflowing_tokens"] - - for doc_span_index in range(len(spans)): - for j in range(spans[doc_span_index]["paragraph_len"]): - is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j) - index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j - spans[doc_span_index]["token_is_max_context"][index] = is_max_context - - for span in spans: - # Identify the position of the CLS token - cls_index = span['input_ids'].index(tokenizer.cls_token_id) - - # p_mask: mask with 1 for token than cannot be in the answer 
(0 for token which can be in an answer) - # Original TF implem also keep the classification token (set to 0) (not sure why...) - p_mask = np.array(span['token_type_ids']) - - p_mask = np.minimum(p_mask, 1) - - if tokenizer.padding_side == "right": - # Limit positive values to one - p_mask = 1 - p_mask - - p_mask[np.where(np.array(span["input_ids"]) == tokenizer.sep_token_id)[0]] = 1 - - # Set the CLS index to '0' - p_mask[cls_index] = 0 - - - span_is_impossible = example.is_impossible - start_position = 0 - end_position = 0 - if is_training and not span_is_impossible: - # For training, if our document chunk does not contain an annotation - # we throw it out, since there is nothing to predict. - doc_start = span["start"] - doc_end = span["start"] + span["length"] - 1 - out_of_span = False - - if not (tok_start_position >= doc_start and tok_end_position <= doc_end): - out_of_span = True - - if out_of_span: - start_position = cls_index - end_position = cls_index - span_is_impossible = True - else: - if tokenizer.padding_side == "left": - doc_offset = 0 - else: - doc_offset = len(truncated_query) + sequence_added_tokens - - start_position = tok_start_position - doc_start + doc_offset - end_position = tok_end_position - doc_start + doc_offset - - - features.append(SquadFeatures( - span['input_ids'], - span['attention_mask'], - span['token_type_ids'], - cls_index, - p_mask.tolist(), - - example_index=example_index, - unique_id=unique_id, - paragraph_len=span['paragraph_len'], - token_is_max_context=span["token_is_max_context"], - tokens=span["tokens"], - token_to_orig_map=span["token_to_orig_map"], - - start_position=start_position, - end_position=end_position - )) - + threads = min(threads, cpu_count()) + with Pool(threads, initializer=squad_convert_example_to_features_init, initargs=(tokenizer,)) as p: + annotate_ = partial(squad_convert_example_to_features, max_seq_length=max_seq_length, + doc_stride=doc_stride, max_query_length=max_query_length, is_training=is_training) + features = list(tqdm(p.imap(annotate_, examples, chunksize=32), total=len(examples), desc='convert squad examples to features')) + new_features = [] + unique_id = 1000000000 + example_index = 0 + for example_features in tqdm(features, total=len(features), desc='add example index and unique id'): + if not example_features: + continue + for example_feature in example_features: + example_feature.example_index = example_index + example_feature.unique_id = unique_id + new_features.append(example_feature) unique_id += 1 - + example_index += 1 + features = new_features + del new_features if return_dataset == 'pt': if not is_torch_available(): raise ImportError("Pytorch must be installed to return a pytorch dataset.") @@ -295,7 +320,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length, all_cls_index, all_p_mask) return features, dataset - return features From a1faaf99625cb05820e7cc6b958f2aa3540778e3 Mon Sep 17 00:00:00 2001 From: erenup Date: Sat, 14 Dec 2019 08:57:13 +0800 Subject: [PATCH 146/302] deleted useless file --- srl_label.txt | 130 -------------------------------------------------- 1 file changed, 130 deletions(-) delete mode 100644 srl_label.txt diff --git a/srl_label.txt b/srl_label.txt deleted file mode 100644 index b6a7f3ad95..0000000000 --- a/srl_label.txt +++ /dev/null @@ -1,130 +0,0 @@ -O -I-ARG1 -I-ARG2 -I-ARG0 -B-V -B-ARG1 -B-ARG0 -I-ARGM-ADV -I-ARGM-TMP -B-ARG2 -I-ARGM-LOC -I-ARGM-MNR -B-ARGM-TMP -I-ARGM-CAU -I-ARGM-PRP -B-ARGM-MOD -I-C-ARG1 -B-ARGM-ADV -I-ARGM-PRD -B-ARGM-DIS -I-ARG3 
-I-V -I-ARG4 -B-ARGM-MNR -B-ARGM-LOC -I-ARGM-NEG -B-ARGM-NEG -B-R-ARG0 -I-ARGM-DIR -I-ARGM-DIS -I-ARGM-PNC -I-ARGM-ADJ -B-R-ARG1 -B-ARG3 -B-ARGM-PRP -B-ARG4 -I-ARGM-GOL -I-R-ARG0 -B-ARGM-CAU -B-ARGM-DIR -B-ARGM-PRD -I-ARGM-EXT -B-C-ARG1 -B-ARGM-ADJ -I-C-ARG0 -B-ARGM-EXT -I-C-ARG2 -I-ARGM-COM -I-R-ARG1 -I-ARGM-MOD -B-ARGM-GOL -B-ARGM-PNC -B-R-ARGM-LOC -B-R-ARGM-TMP -B-ARGM-LVB -B-ARGM-COM -B-R-ARG2 -I-C-ARGM-MNR -B-C-ARG0 -I-R-ARGM-LOC -B-C-ARG2 -I-C-ARGM-EXT -I-C-ARG4 -B-ARGM-REC -I-R-ARG2 -I-C-ARGM-TMP -I-ARG5 -I-C-ARG3 -I-C-ARGM-ADV -B-ARG5 -B-R-ARGM-MNR -I-ARGM-DSP -I-C-ARGM-LOC -B-R-ARG3 -I-ARGA -I-R-ARGM-MNR -B-R-ARGM-CAU -I-R-ARGM-TMP -B-C-ARGM-MNR -B-ARGA -I-C-ARGM-DSP -B-C-ARGM-ADV -I-R-ARG3 -B-R-ARGM-ADV -B-C-ARG4 -I-C-ARGM-CAU -B-C-ARGM-EXT -B-C-ARGM-TMP -B-R-ARGM-DIR -B-R-ARG4 -I-R-ARGM-ADV -I-ARGM-REC -B-C-ARG3 -B-C-ARGM-LOC -B-R-ARGM-EXT -B-ARGM-PRR -B-R-ARGM-PRP -B-ARGM-PRX -I-R-ARGM-DIR -I-R-ARGM-EXT -I-C-ARGM-NEG -B-ARGM-DSP -B-R-ARGM-GOL -I-R-ARGM-GOL -I-R-ARGM-PNC -I-C-ARGM-PRP -B-R-ARGM-COM -I-R-ARGM-PRP -I-C-ARGM-COM -B-C-ARGM-CAU -B-C-ARGM-DSP -I-R-ARGM-COM -I-R-ARGM-CAU -B-R-ARGM-PNC -I-C-ARGM-DIS -I-C-ARGM-DIR -I-R-ARG4 -B-R-ARGM-PRD -I-R-ARGM-PRD -B-C-ARGM-PRP -B-R-ARG5 -B-C-ARGM-MOD -I-C-ARGM-MOD -B-C-ARGM-ADJ -I-C-ARGM-ADJ -B-C-ARGM-DIS -B-C-ARGM-NEG -B-C-ARGM-COM -B-C-ARGM-DIR -B-R-ARGM-MOD From b6d4284b26c0ab5e736cb7838b27b720225feeb7 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 13 Dec 2019 22:43:15 -0500 Subject: [PATCH 147/302] [cli] Uploads: fix + test edge case --- transformers/hf_api.py | 3 +- transformers/tests/fixtures/empty.txt | 0 transformers/tests/hf_api_test.py | 44 +++++++++++++++++++-------- 3 files changed, 33 insertions(+), 14 deletions(-) create mode 100644 transformers/tests/fixtures/empty.txt diff --git a/transformers/hf_api.py b/transformers/hf_api.py index 3bbb6c567a..170732339a 100644 --- a/transformers/hf_api.py +++ b/transformers/hf_api.py @@ -131,8 +131,9 @@ class HfApi: # the client still has to specify it when uploading the file. 
with open(filepath, "rb") as f: pf = TqdmProgressFileReader(f) + data = f if pf.total_size > 0 else "" - r = requests.put(urls.write, data=f, headers={ + r = requests.put(urls.write, data=data, headers={ "content-type": urls.type, }) r.raise_for_status() diff --git a/transformers/tests/fixtures/empty.txt b/transformers/tests/fixtures/empty.txt new file mode 100644 index 0000000000..e69de29bb2 diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py index 92d41b6dff..b45f5aceed 100644 --- a/transformers/tests/hf_api_test.py +++ b/transformers/tests/hf_api_test.py @@ -15,18 +15,30 @@ from __future__ import absolute_import, division, print_function import os -import six import time import unittest -from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError +import requests +import six + +from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj USER = "__DUMMY_TRANSFORMERS_USER__" PASS = "__DUMMY_TRANSFORMERS_PASS__" -FILE_KEY = "Test-{}.txt".format(int(time.time())) -FILE_PATH = os.path.join( - os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt" -) +FILES = [ + ( + "Test-{}.txt".format(int(time.time())), + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt" + ) + ), + ( + "yoyo {}.txt".format(int(time.time())), # space is intentional + os.path.join( + os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt" + ) + ), +] @@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest): self.assertEqual(user, USER) def test_presign(self): - urls = self._api.presign(token=self._token, filename=FILE_KEY) - self.assertIsInstance(urls, PresignedUrl) - self.assertEqual(urls.type, "text/plain") + for FILE_KEY, FILE_PATH in FILES: + urls = self._api.presign(token=self._token, filename=FILE_KEY) + self.assertIsInstance(urls, PresignedUrl) + self.assertEqual(urls.type, "text/plain") def test_presign_and_upload(self): - access_url = self._api.presign_and_upload( - token=self._token, filename=FILE_KEY, filepath=FILE_PATH - ) - self.assertIsInstance(access_url, six.string_types) + for FILE_KEY, FILE_PATH in FILES: + access_url = self._api.presign_and_upload( + token=self._token, filename=FILE_KEY, filepath=FILE_PATH + ) + self.assertIsInstance(access_url, six.string_types) + with open(FILE_PATH, 'r') as f: + body = f.read() + r = requests.get(access_url) + self.assertEqual(r.text, body) def test_list_objs(self): objs = self._api.list_objs(token=self._token) From cbb368ca06998e5d98684bc622e1d8c68ba1d88f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 14 Dec 2019 09:31:18 +0100 Subject: [PATCH 148/302] distilbert tests --- transformers/tests/modeling_common_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index cd4cf247a6..8920e8b826 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -96,9 +96,7 @@ class CommonTestCases: # Make sure we don't have nans out_1 = after_outputs[0].cpu().numpy() - out_2 = outputs[0].cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] + out_1[np.isnan(out_1)] = 0 max_diff = np.amax(np.abs(out_1 - out_2)) self.assertLessEqual(max_diff, 1e-5) From 7140363e092fecf82b73edd423bed3376ec1e150 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 14 Dec 2019 09:44:53 +0100 Subject: [PATCH 149/302] update bertabs --- .../summarization/configuration_bertabs.py | 48 
++++++------------- 1 file changed, 14 insertions(+), 34 deletions(-) diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py index 054763ea93..b862d58d2b 100644 --- a/examples/summarization/configuration_bertabs.py +++ b/examples/summarization/configuration_bertabs.py @@ -33,6 +33,8 @@ class BertAbsConfig(PretrainedConfig): r""" Class to store the configuration of the BertAbs model. Arguments: + vocab_size: int + Number of tokens in the vocabulary. max_pos: int The maximum sequence length that this model will be used with. enc_layer: int @@ -81,39 +83,17 @@ class BertAbsConfig(PretrainedConfig): ): super(BertAbsConfig, self).__init__(**kwargs) - if self._input_is_path_to_json(vocab_size): - path_to_json = vocab_size - with open(path_to_json, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif isinstance(vocab_size, int): - self.vocab_size = vocab_size - self.max_pos = max_pos + self.vocab_size = vocab_size + self.max_pos = max_pos - self.enc_layers = enc_layers - self.enc_hidden_size = enc_hidden_size - self.enc_heads = enc_heads - self.enc_ff_size = enc_ff_size - self.enc_dropout = enc_dropout + self.enc_layers = enc_layers + self.enc_hidden_size = enc_hidden_size + self.enc_heads = enc_heads + self.enc_ff_size = enc_ff_size + self.enc_dropout = enc_dropout - self.dec_layers = dec_layers - self.dec_hidden_size = dec_hidden_size - self.dec_heads = dec_heads - self.dec_ff_size = dec_ff_size - self.dec_dropout = dec_dropout - else: - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) - - def _input_is_path_to_json(self, first_argument): - """ Checks whether the first argument passed to config - is the path to a JSON file that contains the config. 
- """ - is_python_2 = sys.version_info[0] == 2 - if is_python_2: - return isinstance(first_argument, unicode) - else: - return isinstance(first_argument, str) + self.dec_layers = dec_layers + self.dec_hidden_size = dec_hidden_size + self.dec_heads = dec_heads + self.dec_ff_size = dec_ff_size + self.dec_dropout = dec_dropout From cc0135134be671b258b558f3e93ea953f2cfaa48 Mon Sep 17 00:00:00 2001 From: Pascal Voitot Date: Sat, 14 Dec 2019 15:21:56 +0100 Subject: [PATCH 150/302] :zip: #2106 basic tokenizer.tokenize global speed improvement (3-8x) by simply caching added_tokens in a Set --- transformers/tokenization_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 8aef80fec8..83a1588b2a 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -231,6 +231,7 @@ class PreTrainedTokenizer(object): # Added tokens self.added_tokens_encoder = {} + self.unique_added_tokens_encoder = set() self.added_tokens_decoder = {} # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) @@ -550,6 +551,7 @@ class PreTrainedTokenizer(object): added_tok_encoder = dict((tok, len(self) + i) for i, tok in enumerate(to_add_tokens)) added_tok_decoder = {v:k for k, v in added_tok_encoder.items()} self.added_tokens_encoder.update(added_tok_encoder) + self.unique_added_tokens_encoder = set(self.added_tokens_encoder.keys()).union(set(self.all_special_tokens)) self.added_tokens_decoder.update(added_tok_decoder) return len(to_add_tokens) @@ -627,6 +629,7 @@ class PreTrainedTokenizer(object): return added_tokens + def tokenize(self, text, **kwargs): """ Converts a string in a sequence of tokens (string), using the tokenizer. Split in words for word-based vocabulary or sub-words for sub-word-based @@ -681,18 +684,17 @@ class PreTrainedTokenizer(object): for tok in tok_list: tokenized_text = [] for sub_text in text_list: - if sub_text not in self.added_tokens_encoder \ - and sub_text not in all_special_tokens: + if sub_text not in self.unique_added_tokens_encoder: tokenized_text += split_on_token(tok, sub_text) else: tokenized_text += [sub_text] text_list = tokenized_text - return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \ - in self.added_tokens_encoder and token not in all_special_tokens \ + return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) \ + if token not in self.unique_added_tokens_encoder else [token] for token in tokenized_text))) - added_tokens = list(self.added_tokens_encoder.keys()) + all_special_tokens + added_tokens = self.unique_added_tokens_encoder tokenized_text = split_on_tokens(added_tokens, text) return tokenized_text From f1971bf303cc600cc47161f137e1b0baccd62925 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 15 Dec 2019 01:37:16 +0100 Subject: [PATCH 151/302] Binding pipelines to the cli. 
--- transformers-cli | 4 +- transformers/commands/run.py | 56 +++++++++++++++++++ transformers/pipelines.py | 102 +++++++++++++++++++++++++++++++---- 3 files changed, 152 insertions(+), 10 deletions(-) create mode 100644 transformers/commands/run.py diff --git a/transformers-cli b/transformers-cli index 168e6e6f32..39b7f5816b 100755 --- a/transformers-cli +++ b/transformers-cli @@ -2,6 +2,7 @@ from argparse import ArgumentParser from transformers.commands.download import DownloadCommand +from transformers.commands.run import RunCommand from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands from transformers.commands.train import TrainCommand @@ -14,9 +15,10 @@ if __name__ == '__main__': # Register commands ConvertCommand.register_subcommand(commands_parser) DownloadCommand.register_subcommand(commands_parser) + RunCommand.register_subcommand(commands_parser) ServeCommand.register_subcommand(commands_parser) - UserCommands.register_subcommand(commands_parser) TrainCommand.register_subcommand(commands_parser) + UserCommands.register_subcommand(commands_parser) # Let's go args = parser.parse_args() diff --git a/transformers/commands/run.py b/transformers/commands/run.py new file mode 100644 index 0000000000..bcbb87391d --- /dev/null +++ b/transformers/commands/run.py @@ -0,0 +1,56 @@ +from argparse import ArgumentParser + +from transformers.commands import BaseTransformersCLICommand +from transformers.pipelines import pipeline, Pipeline, PipelineDataFormat, SUPPORTED_TASKS + + +def try_infer_format_from_ext(path: str): + for ext in PipelineDataFormat.SUPPORTED_FORMATS: + if path.endswith(ext): + return ext + + raise Exception( + 'Unable to determine file format from file extension {}. ' + 'Please provide the format through --format {}'.format(path, PipelineDataFormat.SUPPORTED_FORMATS) + ) + + +def run_command_factory(args): + nlp = pipeline(task=args.task, model=args.model, tokenizer=args.tokenizer) + format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format + reader = PipelineDataFormat.from_str(format, args.output, args.input, args.column) + return RunCommand(nlp, reader) + + +class RunCommand(BaseTransformersCLICommand): + + def __init__(self, nlp: Pipeline, reader: PipelineDataFormat): + self._nlp = nlp + self._reader = reader + + @staticmethod + def register_subcommand(parser: ArgumentParser): + run_parser = parser.add_parser('run', help="Run a pipeline through the CLI") + run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run') + run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.') + run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)') + run_parser.add_argument('--column', type=str, required=True, help='Name of the column to use as input. 
(For multi columns input as QA use column1,columns2)') + run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from') + run_parser.add_argument('--input', type=str, required=True, help='Path to the file to use for inference') + run_parser.add_argument('--output', type=str, required=True, help='Path to the file that will be used post to write results.') + run_parser.add_argument('kwargs', nargs='*', help='Arguments to forward to the file format reader') + run_parser.set_defaults(func=run_command_factory) + + def run(self): + nlp, output = self._nlp, [] + for entry in self._reader: + if self._reader.is_multi_columns: + output += [nlp(**entry)] + else: + output += [nlp(entry)] + + # Saving data + self._reader.save(output) + + + diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 6fbb7e2f04..a5718b822f 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -14,6 +14,8 @@ # limitations under the License. from __future__ import absolute_import, division, print_function, unicode_literals +import csv +import json import os from abc import ABC, abstractmethod from itertools import groupby @@ -25,11 +27,13 @@ from transformers import AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \ SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger if is_tf_available(): - from transformers import TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification + from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \ + TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification if is_torch_available(): import torch - from transformers import AutoModelForSequenceClassification, AutoModelForQuestionAnswering, AutoModelForTokenClassification + from transformers import AutoModel, AutoModelForSequenceClassification, \ + AutoModelForQuestionAnswering, AutoModelForTokenClassification class Pipeline(ABC): @@ -58,6 +62,84 @@ class Pipeline(ABC): raise NotImplementedError() +class PipelineDataFormat: + SUPPORTED_FORMATS = ['json', 'csv'] + + def __init__(self, output: str, path: str, column: str): + self.output = output + self.path = path + self.column = column.split(',') + self.is_multi_columns = len(self.column) > 1 + + if self.is_multi_columns: + self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column] + + from os.path import abspath, exists + if exists(abspath(self.output)): + raise OSError('{} already exists on disk'.format(self.output)) + + if not exists(abspath(self.path)): + raise OSError('{} doesnt exist on disk'.format(self.path)) + + @abstractmethod + def __iter__(self): + raise NotImplementedError() + + @abstractmethod + def save(self, data: dict): + raise NotImplementedError() + + @staticmethod + def from_str(name: str, output: str, path: str, column: str): + if name == 'json': + return JsonPipelineDataFormat(output, path, column) + elif name == 'csv': + return CsvPipelineDataFormat(output, path, column) + else: + raise KeyError('Unknown reader {} (Available reader are json/csv)'.format(name)) + + +class CsvPipelineDataFormat(PipelineDataFormat): + def __init__(self, output: str, path: str, column: str): + super().__init__(output, path, column) + + def __iter__(self): + with open(self.path, 'r') as f: + reader = csv.DictReader(f) + for row in reader: + if self.is_multi_columns: + yield {k: row[c] for k, c in self.column} + else: + yield row[self.column] 
+ + def save(self, data: List[dict]): + with open(self.output, 'w') as f: + if len(data) > 0: + writer = csv.DictWriter(f, list(data[0].keys())) + writer.writeheader() + writer.writerows(data) + + +class JsonPipelineDataFormat(PipelineDataFormat): + + def __init__(self, output: str, path: str, column: str): + super().__init__(output, path, column) + + with open(path, 'r') as f: + self._entries = json.load(f) + + def __iter__(self): + for entry in self._entries: + if self.is_multi_columns: + yield {k: entry[c] for k, c in self.column} + else: + yield entry[self.column] + + def save(self, data: dict): + with open(self.output, 'w') as f: + json.dump(data, f) + + class FeatureExtractionPipeline(Pipeline): def __call__(self, *texts, **kwargs): @@ -127,7 +209,7 @@ class NerPipeline(Pipeline): label_idx = score.argmax() answer += [{ - 'word': words[idx - 1], 'score': score[label_idx], 'entity': self.model.config.id2label[label_idx] + 'word': words[idx - 1], 'score': score[label_idx].item(), 'entity': self.model.config.id2label[label_idx] }] # Update token start @@ -270,16 +352,18 @@ class QuestionAnsweringPipeline(Pipeline): char_to_word = np.array(example.char_to_word_offset) # Convert the answer (tokens) back to the original text - answers += [[ + answers += [ { - 'score': score, - 'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0], - 'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1], + 'score': score.item(), + 'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), + 'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), 'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1]) } for s, e, score in zip(starts, ends, scores) - ]] + ] + if len(answers) == 1: + return answers[0] return answers def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: @@ -363,7 +447,7 @@ def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokeni Utility factory method to build pipeline. """ # Try to infer tokenizer from model name (if provided as str) - if not isinstance(tokenizer, PreTrainedTokenizer): + if tokenizer is None: if not isinstance(model, str): # Impossible to guest what is the right tokenizer here raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') From 8e3b1c860fbd872cb39d5093a1e739a67e7d0809 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Sun, 15 Dec 2019 01:37:52 +0100 Subject: [PATCH 152/302] Added FeatureExtraction pipeline. 
--- transformers/pipelines.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index a5718b822f..7383222c1f 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -143,7 +143,23 @@ class JsonPipelineDataFormat(PipelineDataFormat): class FeatureExtractionPipeline(Pipeline): def __call__(self, *texts, **kwargs): - pass + # Generic compatibility with sklearn and Keras + if 'X' in kwargs and not texts: + texts = kwargs.pop('X') + + inputs = self.tokenizer.batch_encode_plus( + texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + ) + + if is_tf_available(): + # TODO trace model + predictions = self.model(inputs)[0] + else: + import torch + with torch.no_grad(): + predictions = self.model(**inputs)[0] + + return predictions.numpy().tolist() class TextClassificationPipeline(Pipeline): @@ -424,6 +440,11 @@ class QuestionAnsweringPipeline(Pipeline): # Register all the supported task here SUPPORTED_TASKS = { + 'feature-extraction': { + 'impl': FeatureExtractionPipeline, + 'tf': TFAutoModel if is_tf_available() else None, + 'pt': AutoModel if is_torch_available() else None, + }, 'text-classification': { 'impl': TextClassificationPipeline, 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, From 1b8613acb32a568db8d9b74ee182d43c4f8e9cbb Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 09:51:42 +0100 Subject: [PATCH 153/302] updating t5 config class --- transformers/configuration_t5.py | 15 ++------------- transformers/tests/modeling_t5_test.py | 2 +- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py index 6391cb4180..377a0919d9 100644 --- a/transformers/configuration_t5.py +++ b/transformers/configuration_t5.py @@ -66,7 +66,7 @@ class T5Config(PretrainedConfig): pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP def __init__(self, - vocab_size_or_config_json_file=32128, + vocab_size=32128, n_positions=512, d_model=512, d_kv=64, @@ -79,7 +79,7 @@ class T5Config(PretrainedConfig): initializer_factor=1.0, **kwargs): super(T5Config, self).__init__(**kwargs) - self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1 + self.vocab_size = vocab_size self.n_positions = n_positions self.d_model = d_model self.d_kv = d_kv @@ -91,17 +91,6 @@ class T5Config(PretrainedConfig): self.layer_norm_epsilon = layer_norm_epsilon self.initializer_factor = initializer_factor - if isinstance(vocab_size_or_config_json_file, six.string_types): - with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader: - json_config = json.loads(reader.read()) - for key, value in json_config.items(): - self.__dict__[key] = value - elif not isinstance(vocab_size_or_config_json_file, int): - raise ValueError( - "First argument must be either a vocabulary size (int)" - "or the path to a pretrained model config file (str)" - ) - @property def max_position_embeddings(self): return self.n_positions diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index a539cc868a..c337163375 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -93,7 +93,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester): decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size) config = T5Config( - 
vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_positions=self.n_positions, d_model=self.hidden_size, d_ff=self.d_ff, From 8669598abd7af877bd33890d62ae70ec1623f145 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 09:59:36 +0100 Subject: [PATCH 154/302] update t5 tf --- transformers/tests/modeling_tf_t5_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index 99eec313f9..b905a9875b 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -87,7 +87,7 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) config = T5Config( - vocab_size_or_config_json_file=self.vocab_size, + vocab_size=self.vocab_size, n_positions=self.n_positions, d_model=self.hidden_size, d_ff=self.d_ff, From 56e98ba81a9a7410243a1117fb6148d5f353ef98 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 11:07:27 +0100 Subject: [PATCH 155/302] add model cards cc @mfuntowicz --- transformers/__init__.py | 3 + transformers/file_utils.py | 2 +- transformers/model_card.py | 248 ++++++++++++++++++++++++++ transformers/tests/model_card_test.py | 87 +++++++++ 4 files changed, 339 insertions(+), 1 deletion(-) create mode 100644 transformers/model_card.py create mode 100644 transformers/tests/model_card_test.py diff --git a/transformers/__init__.py b/transformers/__init__.py index 740d2440c2..15c167a5ce 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -33,6 +33,9 @@ from .data import (is_sklearn_available, if is_sklearn_available(): from .data import glue_compute_metrics, xnli_compute_metrics +# Model Cards +from .model_card import ModelCard + # Tokenizers from .tokenization_utils import (PreTrainedTokenizer) from .tokenization_auto import AutoTokenizer diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 03b2fdb9f4..81c9b8002f 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -72,7 +72,7 @@ WEIGHTS_NAME = "pytorch_model.bin" TF2_WEIGHTS_NAME = 'tf_model.h5' TF_WEIGHTS_NAME = 'model.ckpt' CONFIG_NAME = "config.json" - +MODEL_CARD_NAME = "model_card.json" DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] diff --git a/transformers/model_card.py b/transformers/model_card.py new file mode 100644 index 0000000000..679c24872a --- /dev/null +++ b/transformers/model_card.py @@ -0,0 +1,248 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" Configuration base class and utilities.""" + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import copy +import json +import logging +import os +import re +from io import open + +from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP + +from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url + + +logger = logging.getLogger(__name__) + + +ALL_MODELS_MAP = dict((key, value) + for pretrained_map in [ + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, + TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + T5_PRETRAINED_CONFIG_ARCHIVE_MAP, + ] + for key, value, in pretrained_map.items()) + + +class ModelCard(object): + r""" Model Card class. + Store model card as well as methods for loading/downloading/saving model cards. + + Please read the following paper for details and explanation on the sections: + "Model Cards for Model Reporting" + by Margaret Mitchell, Simone Wu, + Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer, + Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards. + Link: https://arxiv.org/abs/1810.03993 + + Note: + A model card can be loaded and saved to disk. + + Parameters: + """ + def __init__(self, **kwargs): + # Recomended attributes from https://arxiv.org/abs/1810.03993 (see papers) + self.model_details = kwargs.pop('model_details', {}) + self.intended_use = kwargs.pop('intended_use', {}) + self.factors = kwargs.pop('factors', {}) + self.metrics = kwargs.pop('metrics', {}) + self.evaluation_data = kwargs.pop('evaluation_data', {}) + self.training_data = kwargs.pop('training_data', {}) + self.quantitative_analyses = kwargs.pop('quantitative_analyses', {}) + self.ethical_considerations = kwargs.pop('ethical_considerations', {}) + self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {}) + + # Open additional attributes + for key, value in kwargs.items(): + try: + setattr(self, key, value) + except AttributeError as err: + logger.error("Can't set {} with value {} for {}".format(key, value, self)) + raise err + + def save_pretrained(self, save_directory): + """ Save a model card object to the directory `save_directory`, so that it + can be re-loaded using the :func:`~transformers.ModelCard.from_pretrained` class method. 
+ """ + assert os.path.isdir(save_directory), "Saving path should be a directory where the model card can be saved" + + # If we save using the predefined names, we can load using `from_pretrained` + output_model_card_file = os.path.join(save_directory, MODEL_CARD_NAME) + + self.to_json_file(output_model_card_file) + logger.info("Model card saved in {}".format(output_model_card_file)) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model model card. + + Parameters: + pretrained_model_name_or_path: either: + + - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``. + - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. + - a path to a `directory` containing a mode card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``. + - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/model_card.json``. + + cache_dir: (`optional`) string: + Path to a directory in which a downloaded pre-trained model + card should be cached if the standard cache should not be used. + + kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading. + + - The values in kwargs of any keys which are model card attributes will be used to override the loaded values. + - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter. + + force_download: (`optional`) boolean, default False: + Force to (re-)download the model card file and override the cached version if it exists. + + resume_download: (`optional`) boolean, default False: + Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. + + proxies: (`optional`) dict, default None: + A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. + The proxies are used on each request. + + return_unused_kwargs: (`optional`) bool: + + - If False, then this function returns just the final model card object. + - If True, then this functions returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: ie the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored. + + Examples:: + + model_card = ModelCard.from_pretrained('bert-base-uncased') # Download model card from S3 and cache. + model_card = ModelCard.from_pretrained('./test/saved_model/') # E.g. 
model card was saved using `save_pretrained('./test/saved_model/')` + model_card = ModelCard.from_pretrained('./test/saved_model/model_card.json') + model_card = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + + """ + cache_dir = kwargs.pop('cache_dir', None) + force_download = kwargs.pop('force_download', False) + resume_download = kwargs.pop('resume_download', False) + proxies = kwargs.pop('proxies', None) + return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) + + if pretrained_model_name_or_path in ALL_MODELS_MAP: + model_card_file = ALL_MODELS_MAP[pretrained_model_name_or_path] + model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) # For simplicity we use the same pretrained url than config but with a different suffix + elif os.path.isdir(pretrained_model_name_or_path): + model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME) + elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + model_card_file = pretrained_model_name_or_path + else: + model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME) + # redirect to the cache, if necessary + try: + resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download, + proxies=proxies, resume_download=resume_download) + + if resolved_model_card_file == model_card_file: + logger.info("loading model card file {}".format(model_card_file)) + else: + logger.info("loading model card file {} from cache at {}".format( + model_card_file, resolved_model_card_file)) + + # Load model card + model_card = cls.from_json_file(resolved_model_card_file) + + except EnvironmentError: + if pretrained_model_name_or_path in ALL_MODELS_MAP: + logger.warning("Couldn't reach server at '{}' to download model card file.".format( + model_card_file)) + else: + logger.warning("Model name '{}' was not found in model name list ({}). 
" \ + "We assumed '{}' was a path or url to a model card file named {} or " \ + "a directory containing such a file but couldn't find any such file at this path or url.".format( + pretrained_model_name_or_path, + ', '.join(ALL_MODELS_MAP.keys()), + model_card_file, MODEL_CARD_NAME)) + + logger.warning("Creating an empty model card.") + + # We fall back on creating an empty model card + model_card = cls() + + # Update model card with kwargs if needed + to_remove = [] + for key, value in kwargs.items(): + if hasattr(model_card, key): + setattr(model_card, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + + logger.info("Model card: %s", str(model_card)) + if return_unused_kwargs: + return model_card, kwargs + else: + return model_card + + @classmethod + def from_dict(cls, json_object): + """Constructs a `ModelCard` from a Python dictionary of parameters.""" + return cls(**json_object) + + @classmethod + def from_json_file(cls, json_file): + """Constructs a `ModelCard` from a json file of parameters.""" + with open(json_file, "r", encoding='utf-8') as reader: + text = reader.read() + dict_obj = json.loads(text) + return cls(**dict_obj) + + def __eq__(self, other): + return self.__dict__ == other.__dict__ + + def __repr__(self): + return str(self.to_json_string()) + + def to_dict(self): + """Serializes this instance to a Python dictionary.""" + output = copy.deepcopy(self.__dict__) + return output + + def to_json_string(self): + """Serializes this instance to a JSON string.""" + return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path): + """ Save this instance to a json file.""" + with open(json_file_path, "w", encoding='utf-8') as writer: + writer.write(self.to_json_string()) diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py new file mode 100644 index 0000000000..4364cbacec --- /dev/null +++ b/transformers/tests/model_card_test.py @@ -0,0 +1,87 @@ +# coding=utf-8 +# Copyright 2019 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import absolute_import, division, print_function, unicode_literals + +import os +import sys +import json +import tempfile +import shutil +import unittest + +from transformers.model_card import ModelCard +from .tokenization_tests_commons import TemporaryDirectory + +class ModelCardTester(unittest.TestCase): + + def setUp(self): + self.inputs_dict = {'model_details': { + 'Organization': 'testing', + 'Model date': 'today', + 'Model version': 'v2.1, Developed by Test Corp in 2019.', + 'Architecture': 'Convolutional Neural Network.', + }, + 'metrics': 'BLEU and ROUGE-1', + 'evaluation_data':{ + 'Datasets':{ + 'BLEU': 'My-great-dataset-v1', + 'ROUGE-1': 'My-short-dataset-v2.1', + }, + 'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf' + }, + 'training_data':{ + 'Dataset': 'English Wikipedia dump dated 2018-12-01', + 'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. 
See details on https://arxiv.org/pdf/1810.03993.pdf' + }, + 'quantitative_analyses': { + 'BLEU': 55.1, + 'ROUGE-1': 76, + }, + } + self.tmpdirname = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.tmpdirname) + + def test_model_card_common_properties(self): + model_card = ModelCard.from_dict(self.inputs_dict) + self.assertTrue(hasattr(model_card, 'model_details')) + self.assertTrue(hasattr(model_card, 'intended_use')) + self.assertTrue(hasattr(model_card, 'factors')) + self.assertTrue(hasattr(model_card, 'metrics')) + self.assertTrue(hasattr(model_card, 'evaluation_data')) + self.assertTrue(hasattr(model_card, 'training_data')) + self.assertTrue(hasattr(model_card, 'quantitative_analyses')) + self.assertTrue(hasattr(model_card, 'ethical_considerations')) + self.assertTrue(hasattr(model_card, 'caveats_and_recommendations')) + + def test_model_card_to_json_string(self): + model_card = ModelCard.from_dict(self.inputs_dict) + obj = json.loads(model_card.to_json_string()) + for key, value in self.inputs_dict.items(): + self.assertEqual(obj[key], value) + + def test_model_card_to_json_file(self): + model_card_first = ModelCard.from_dict(self.inputs_dict) + + with TemporaryDirectory() as tmpdirname: + filename = os.path.join(tmpdirname, u"model_card.json") + model_card_first.to_json_file(filename) + model_card_second = ModelCard.from_json_file(filename) + + self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) + +if __name__ == "__main__": + unittest.main() From d3418a94ff4256725a690bd9c8167489b6f593b8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 13:52:41 +0100 Subject: [PATCH 156/302] update tests --- .../tests/configuration_common_test.py | 27 ++++++++++++------- transformers/tests/model_card_test.py | 16 ++++++----- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py index 8ee751153c..376d110d3c 100644 --- a/transformers/tests/configuration_common_test.py +++ b/transformers/tests/configuration_common_test.py @@ -16,15 +16,12 @@ from __future__ import absolute_import from __future__ import division from __future__ import print_function -import copy import os -import shutil import json -import random -import uuid +import tempfile import unittest -import logging +from .tokenization_tests_commons import TemporaryDirectory class ConfigTester(object): @@ -48,16 +45,28 @@ class ConfigTester(object): def create_and_test_config_to_json_file(self): config_first = self.config_class(**self.inputs_dict) - json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json") - config_first.to_json_file(json_file_path) - config_second = self.config_class.from_json_file(json_file_path) - os.remove(json_file_path) + + with TemporaryDirectory() as tmpdirname: + json_file_path = os.path.join(tmpdirname, "config.json") + config_first.to_json_file(json_file_path) + config_second = self.config_class.from_json_file(json_file_path) + + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) + + def create_and_test_config_from_and_save_pretrained(self): + config_first = self.config_class(**self.inputs_dict) + + with TemporaryDirectory() as tmpdirname: + config_first.save_pretrained(tmpdirname) + config_second = self.config_class.from_pretrained(tmpdirname) + self.parent.assertEqual(config_second.to_dict(), config_first.to_dict()) def run_common_tests(self): self.create_and_test_config_common_properties() 
self.create_and_test_config_to_json_string() self.create_and_test_config_to_json_file() + self.create_and_test_config_from_and_save_pretrained() if __name__ == "__main__": unittest.main() \ No newline at end of file diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py index 4364cbacec..e75716f0aa 100644 --- a/transformers/tests/model_card_test.py +++ b/transformers/tests/model_card_test.py @@ -15,10 +15,7 @@ from __future__ import absolute_import, division, print_function, unicode_literals import os -import sys import json -import tempfile -import shutil import unittest from transformers.model_card import ModelCard @@ -50,10 +47,6 @@ class ModelCardTester(unittest.TestCase): 'ROUGE-1': 76, }, } - self.tmpdirname = tempfile.mkdtemp() - - def tearDown(self): - shutil.rmtree(self.tmpdirname) def test_model_card_common_properties(self): model_card = ModelCard.from_dict(self.inputs_dict) @@ -83,5 +76,14 @@ class ModelCardTester(unittest.TestCase): self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) + def test_model_card_from_and_save_pretrained(self): + model_card_first = ModelCard.from_dict(self.inputs_dict) + + with TemporaryDirectory() as tmpdirname: + model_card_first.save_pretrained(tmpdirname) + model_card_second = ModelCard.from_pretrained(tmpdirname) + + self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict()) + if __name__ == "__main__": unittest.main() From a4d07b983a6c1716b4d39cf3fed570562aebf3f7 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 14:00:32 +0100 Subject: [PATCH 157/302] dict of all config and model files cc @LysandreJik --- transformers/__init__.py | 6 ++--- transformers/configuration_auto.py | 42 ++++++++++++++++++++--------- transformers/model_card.py | 43 +++++------------------------- transformers/modeling_auto.py | 42 ++++++++++++++++++++--------- transformers/modeling_tf_auto.py | 38 +++++++++++++++++++------- 5 files changed, 98 insertions(+), 73 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index 15c167a5ce..0b343bed2b 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -55,7 +55,7 @@ from .tokenization_t5 import T5Tokenizer # Configurations from .configuration_utils import PretrainedConfig -from .configuration_auto import AutoConfig +from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP @@ -73,7 +73,7 @@ from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP if is_torch_available(): from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D) from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering, - AutoModelWithLMHead) + AutoModelWithLMHead, ALL_PRETRAINED_MODEL_ARCHIVE_MAP) from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction, @@ -131,7 +131,7 @@ if is_torch_available(): if is_tf_available(): from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering, - TFAutoModelWithLMHead) + TFAutoModelWithLMHead, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP) 
from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings, TFBertModel, TFBertForPreTraining, diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index 680c55fa54..9fe58f173a 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -18,22 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .configuration_bert import BertConfig -from .configuration_openai import OpenAIGPTConfig -from .configuration_gpt2 import GPT2Config -from .configuration_transfo_xl import TransfoXLConfig -from .configuration_xlnet import XLNetConfig -from .configuration_xlm import XLMConfig -from .configuration_roberta import RobertaConfig -from .configuration_distilbert import DistilBertConfig -from .configuration_ctrl import CTRLConfig -from .configuration_camembert import CamembertConfig -from .configuration_albert import AlbertConfig -from .configuration_t5 import T5Config +from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP logger = logging.getLogger(__name__) +ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value) + for pretrained_map in [ + BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, + TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, + GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, + CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, + ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + T5_PRETRAINED_CONFIG_ARCHIVE_MAP, + ] + for key, value, in pretrained_map.items()) + + class AutoConfig(object): r""":class:`~transformers.AutoConfig` is a generic configuration class that will be instantiated as one of the configuration classes of the library diff --git a/transformers/model_card.py b/transformers/model_card.py index 679c24872a..6d56089844 100644 --- a/transformers/model_card.py +++ b/transformers/model_card.py @@ -21,21 +21,9 @@ import copy import json import logging import os -import re from io import open -from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlnet import 
XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP -from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url @@ -43,24 +31,6 @@ from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url logger = logging.getLogger(__name__) -ALL_MODELS_MAP = dict((key, value) - for pretrained_map in [ - BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, - TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, - GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, - CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP, - XLM_PRETRAINED_CONFIG_ARCHIVE_MAP, - ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, - DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, - T5_PRETRAINED_CONFIG_ARCHIVE_MAP, - ] - for key, value, in pretrained_map.items()) - - class ModelCard(object): r""" Model Card class. Store model card as well as methods for loading/downloading/saving model cards. @@ -159,9 +129,10 @@ class ModelCard(object): proxies = kwargs.pop('proxies', None) return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) - if pretrained_model_name_or_path in ALL_MODELS_MAP: - model_card_file = ALL_MODELS_MAP[pretrained_model_name_or_path] - model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) # For simplicity we use the same pretrained url than config but with a different suffix + if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: + # For simplicity we use the same pretrained url than the configuration files but with a different suffix (model_card.json) + model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] + model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) elif os.path.isdir(pretrained_model_name_or_path): model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): @@ -183,7 +154,7 @@ class ModelCard(object): model_card = cls.from_json_file(resolved_model_card_file) except EnvironmentError: - if pretrained_model_name_or_path in ALL_MODELS_MAP: + if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: logger.warning("Couldn't reach server at '{}' to download model card file.".format( model_card_file)) else: @@ -191,7 +162,7 @@ class ModelCard(object): "We assumed '{}' was a path or url to a model card file named {} or " \ "a directory containing such a file but couldn't find any such file at this path or url.".format( pretrained_model_name_or_path, - ', '.join(ALL_MODELS_MAP.keys()), + ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), model_card_file, MODEL_CARD_NAME)) logger.warning("Creating an empty model card.") diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 19a54cca86..1a30ea4623 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -18,18 +18,18 @@ from __future__ import absolute_import, division, print_function, unicode_litera import 
logging -from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering -from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel -from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel -from .modeling_ctrl import CTRLModel, CTRLLMHeadModel -from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel -from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering -from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering -from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification -from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification -from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice -from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering -from .modeling_t5 import T5Model, T5WithLMHeadModel +from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_utils import PreTrainedModel, SequenceSummary @@ -38,6 +38,24 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) +ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) + for pretrained_map in [ + BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ] + for key, value, in pretrained_map.items()) + + class AutoModel(object): r""" 
:class:`~transformers.AutoModel` is a generic model class diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index b4ff660098..9c687d9235 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -18,22 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera import logging -from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering -from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel -from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel -from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel -from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple -from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple -from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification -from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification -from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel -from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel +from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) +TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) + for pretrained_map in [ + TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, + ] + for key, value, in pretrained_map.items()) + + class TFAutoModel(object): r""" :class:`~transformers.TFAutoModel` is a generic model class From 
db0a9ee6e0ddcb9d634c3ab0ba3d25501c370d8c Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 14:08:08 +0100 Subject: [PATCH 158/302] adding albert to TF auto models cc @LysandreJik --- transformers/modeling_tf_auto.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py index 9c687d9235..3e9b4d120b 100644 --- a/transformers/modeling_tf_auto.py +++ b/transformers/modeling_tf_auto.py @@ -27,6 +27,7 @@ from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceC from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP from .file_utils import add_start_docstrings @@ -46,7 +47,6 @@ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, - TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP, ] for key, value, in pretrained_map.items()) @@ -162,6 +162,8 @@ class TFAutoModel(object): return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'distilbert' in pretrained_model_name_or_path: return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'albert' in pretrained_model_name_or_path: + return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -298,6 +300,8 @@ class TFAutoModelWithLMHead(object): return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'distilbert' in pretrained_model_name_or_path: return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'albert' in pretrained_model_name_or_path: + return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -425,6 +429,8 @@ class TFAutoModelForSequenceClassification(object): """ if 'distilbert' in pretrained_model_name_or_path: return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'albert' in pretrained_model_name_or_path: + return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: From 031ad4eb3780437d5232392b16891078b1b32d2c Mon 
Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 14:20:57 +0100 Subject: [PATCH 159/302] improving JSON error messages (for model card and configurations) --- transformers/configuration_utils.py | 15 +++++++++++---- transformers/model_card.py | 12 ++++++++---- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 6c9eeea175..f692c9b132 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -151,10 +151,14 @@ class PretrainedConfig(object): config_file = pretrained_model_name_or_path else: config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME) - # redirect to the cache, if necessary + try: + # Load from URL or cache if already cached resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download) + # Load config + config = cls.from_json_file(resolved_config_file) + except EnvironmentError: if pretrained_model_name_or_path in cls.pretrained_config_archive_map: msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format( @@ -168,15 +172,18 @@ class PretrainedConfig(object): config_file, CONFIG_NAME) raise EnvironmentError(msg) + except json.JSONDecodeError: + msg = "Couldn't reach server at '{}' to download configuration file or " \ + "configuration file is not a valid JSON file. " \ + "Please check network or file content here: {}.".format(config_file, resolved_config_file) + raise EnvironmentError(msg) + if resolved_config_file == config_file: logger.info("loading configuration file {}".format(config_file)) else: logger.info("loading configuration file {} from cache at {}".format( config_file, resolved_config_file)) - # Load config - config = cls.from_json_file(resolved_config_file) - if hasattr(config, 'pruned_heads'): config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items()) diff --git a/transformers/model_card.py b/transformers/model_card.py index 6d56089844..3c775ab7fc 100644 --- a/transformers/model_card.py +++ b/transformers/model_card.py @@ -132,7 +132,7 @@ class ModelCard(object): if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: # For simplicity we use the same pretrained url than the configuration files but with a different suffix (model_card.json) model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] - model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) + model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) elif os.path.isdir(pretrained_model_name_or_path): model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): @@ -143,13 +143,11 @@ class ModelCard(object): try: resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download) - if resolved_model_card_file == model_card_file: logger.info("loading model card file {}".format(model_card_file)) else: logger.info("loading model card file {} from cache at {}".format( model_card_file, resolved_model_card_file)) - # Load model card model_card = cls.from_json_file(resolved_model_card_file) @@ -164,9 +162,15 @@ class ModelCard(object): pretrained_model_name_or_path, ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), model_card_file, 
MODEL_CARD_NAME)) - logger.warning("Creating an empty model card.") + # We fall back on creating an empty model card + model_card = cls() + except json.JSONDecodeError: + logger.warning("Couldn't reach server at '{}' to download model card file or " + "model card file is not a valid JSON file. " + "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)) + logger.warning("Creating an empty model card.") # We fall back on creating an empty model card model_card = cls() From 955d7ecb570b178187075c7c31fcd9be2e3a3428 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 16 Dec 2019 14:34:54 +0100 Subject: [PATCH 160/302] Refactored Pipeline with dedicated argument handler. --- transformers/pipelines.py | 210 ++++++++++++++++++++------------------ 1 file changed, 112 insertions(+), 98 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 7383222c1f..7e2b30ba3c 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -36,29 +36,40 @@ if is_torch_available(): AutoModelForQuestionAnswering, AutoModelForTokenClassification -class Pipeline(ABC): - def __init__(self, model, tokenizer: PreTrainedTokenizer = None, **kwargs): - self.model = model - self.tokenizer = tokenizer +class ArgumentHandler(ABC): + """ + Base interface for handling varargs for each Pipeline + """ + @abstractmethod + def __call__(self, *args, **kwargs): + raise NotImplementedError() - def save_pretrained(self, save_directory): - if not os.path.isdir(save_directory): - logger.error("Provided path ({}) should be a directory".format(save_directory)) - return - self.model.save_pretrained(save_directory) - self.tokenizer.save_pretrained(save_directory) +class DefaultArgumentHandler(ArgumentHandler): + """ + Default varargs argument parser handling parameters for each Pipeline + """ + def __call__(self, *args, **kwargs): + if 'X' in kwargs: + return kwargs['X'] + elif 'data' in kwargs: + return kwargs['data'] + elif len(args) > 0: + return list(args) + raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)') - def transform(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - return self(*texts, **kwargs) - def predict(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - return self(*texts, **kwargs) +class _ScikitCompat(ABC): + """ + Interface layer for the Scikit and Keras compatibility. 
+ """ @abstractmethod - def __call__(self, *texts, **kwargs): + def transform(self, X): + raise NotImplementedError() + + @abstractmethod + def predict(self, X): raise NotImplementedError() @@ -133,24 +144,45 @@ class JsonPipelineDataFormat(PipelineDataFormat): if self.is_multi_columns: yield {k: entry[c] for k, c in self.column} else: - yield entry[self.column] + yield entry[self.column[0]] def save(self, data: dict): with open(self.output, 'w') as f: json.dump(data, f) -class FeatureExtractionPipeline(Pipeline): +class Pipeline(_ScikitCompat): + def __init__(self, model, tokenizer: PreTrainedTokenizer = None, args_parser: ArgumentHandler = None, **kwargs): + self.model = model + self.tokenizer = tokenizer + self._args_parser = args_parser or DefaultArgumentHandler() + + def save_pretrained(self, save_directory): + if not os.path.isdir(save_directory): + logger.error("Provided path ({}) should be a directory".format(save_directory)) + return + + self.model.save_pretrained(save_directory) + self.tokenizer.save_pretrained(save_directory) + + def transform(self, X): + return self(X=X) + + def predict(self, X): + return self(X=X) def __call__(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - if 'X' in kwargs and not texts: - texts = kwargs.pop('X') + # Parse arguments + inputs = self._args_parser(*texts, **kwargs) + # Encode for forward inputs = self.tokenizer.batch_encode_plus( - texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' ) + return self._forward(inputs) + + def _forward(self, inputs): if is_tf_available(): # TODO trace model predictions = self.model(inputs)[0] @@ -159,7 +191,12 @@ class FeatureExtractionPipeline(Pipeline): with torch.no_grad(): predictions = self.model(**inputs)[0] - return predictions.numpy().tolist() + return predictions.numpy() + + +class FeatureExtractionPipeline(Pipeline): + def __call__(self, *args, **kwargs): + return super().__call__(*args, **kwargs).tolist() class TextClassificationPipeline(Pipeline): @@ -170,26 +207,8 @@ class TextClassificationPipeline(Pipeline): raise Exception('Invalid parameter nb_classes. int >= 2 is required (got: {})'.format(nb_classes)) self._nb_classes = nb_classes - def __call__(self, *texts, **kwargs): - # Generic compatibility with sklearn and Keras - if 'X' in kwargs and not texts: - texts = kwargs.pop('X') - - inputs = self.tokenizer.batch_encode_plus( - texts, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' - ) - - special_tokens_mask = inputs.pop('special_tokens_mask') - - if is_tf_available(): - # TODO trace model - predictions = self.model(**inputs)[0] - else: - import torch - with torch.no_grad(): - predictions = self.model(**inputs)[0] - - return predictions.numpy().tolist() + def __call__(self, *args, **kwargs): + return super().__call__(*args, **kwargs).tolist() class NerPipeline(Pipeline): @@ -198,8 +217,7 @@ class NerPipeline(Pipeline): super().__init__(model, tokenizer) def __call__(self, *texts, **kwargs): - (texts, ), answers = texts, [] - + inputs, answers = self._args_parser(*texts, **kwargs), [] for sentence in texts: # Ugly token to word idx mapping (for now) @@ -241,9 +259,52 @@ class QuestionAnsweringPipeline(Pipeline): Question Answering pipeline involving Tokenization and Inference. 
""" - @classmethod - def from_config(cls, model, tokenizer: PreTrainedTokenizer, **kwargs): - pass + class QuestionAnsweringArgumentHandler(ArgumentHandler): + + def __call__(self, *args, **kwargs): + # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating + if args is not None and len(args) > 1: + kwargs['X'] = args + + # Generic compatibility with sklearn and Keras + # Batched data + if 'X' in kwargs or 'data' in kwargs: + data = kwargs['X'] if 'X' in kwargs else kwargs['data'] + + if not isinstance(data, list): + data = [data] + + for i, item in enumerate(data): + if isinstance(item, dict): + if any(k not in item for k in ['question', 'context']): + raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') + data[i] = QuestionAnsweringPipeline.create_sample(**item) + + elif isinstance(item, SquadExample): + continue + else: + raise ValueError( + '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' + .format('X' if 'X' in kwargs else 'data') + ) + inputs = data + + # Tabular input + elif 'question' in kwargs and 'context' in kwargs: + if isinstance(kwargs['question'], str): + kwargs['question'] = [kwargs['question']] + + if isinstance(kwargs['context'], str): + kwargs['context'] = [kwargs['context']] + + inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] + else: + raise ValueError('Unknown arguments {}'.format(kwargs)) + + if not isinstance(inputs, list): + inputs = [inputs] + + return inputs @staticmethod def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: @@ -254,54 +315,8 @@ class QuestionAnsweringPipeline(Pipeline): else: return SquadExample(None, question, context, None, None, None) - @staticmethod - def handle_args(*inputs, **kwargs) -> List[SquadExample]: - # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating - if inputs is not None and len(inputs) > 1: - kwargs['X'] = inputs - - # Generic compatibility with sklearn and Keras - # Batched data - if 'X' in kwargs or 'data' in kwargs: - data = kwargs['X'] if 'X' in kwargs else kwargs['data'] - - if not isinstance(data, list): - data = [data] - - for i, item in enumerate(data): - if isinstance(item, dict): - if any(k not in item for k in ['question', 'context']): - raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') - data[i] = QuestionAnsweringPipeline.create_sample(**item) - - elif isinstance(item, SquadExample): - continue - else: - raise ValueError( - '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' - .format('X' if 'X' in kwargs else 'data') - ) - inputs = data - - # Tabular input - elif 'question' in kwargs and 'context' in kwargs: - if isinstance(kwargs['question'], str): - kwargs['question'] = [kwargs['question']] - - if isinstance(kwargs['context'], str): - kwargs['context'] = [kwargs['context']] - - inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] - else: - raise ValueError('Unknown arguments {}'.format(kwargs)) - - if not isinstance(inputs, list): - inputs = [inputs] - - return inputs - def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]): - super().__init__(model, tokenizer) + super().__init__(model, tokenizer, args_parser=QuestionAnsweringPipeline.QuestionAnsweringArgumentHandler()) def 
inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: args = ['input_ids', 'attention_mask'] @@ -332,9 +347,8 @@ class QuestionAnsweringPipeline(Pipeline): if kwargs['max_answer_len'] < 1: raise ValueError('max_answer_len parameter should be >= 1 (got {})'.format(kwargs['max_answer_len'])) - examples = QuestionAnsweringPipeline.handle_args(texts, **kwargs) - # Convert inputs to features + examples = self._args_parser(*texts, **kwargs) features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) fw_args = self.inputs_for_model(features) From 1bbdbacd5bc7281dbcebfe4330a464a7ad1a6e72 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 14:38:20 +0100 Subject: [PATCH 161/302] update __init__ and saving --- transformers/__init__.py | 2 +- transformers/model_card.py | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index 0b343bed2b..44447c5495 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -19,7 +19,7 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name # Files and general utilities from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path, add_start_docstrings, add_end_docstrings, - WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, + WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME, is_tf_available, is_torch_available) from .data import (is_sklearn_available, diff --git a/transformers/model_card.py b/transformers/model_card.py index 3c775ab7fc..baec7e8622 100644 --- a/transformers/model_card.py +++ b/transformers/model_card.py @@ -67,14 +67,14 @@ class ModelCard(object): logger.error("Can't set {} with value {} for {}".format(key, value, self)) raise err - def save_pretrained(self, save_directory): - """ Save a model card object to the directory `save_directory`, so that it - can be re-loaded using the :func:`~transformers.ModelCard.from_pretrained` class method. + def save_pretrained(self, save_directory_or_file): + """ Save a model card object to the directory or file `save_directory_or_file`. 
""" - assert os.path.isdir(save_directory), "Saving path should be a directory where the model card can be saved" - - # If we save using the predefined names, we can load using `from_pretrained` - output_model_card_file = os.path.join(save_directory, MODEL_CARD_NAME) + if os.path.isdir(save_directory_or_file): + # If we save using the predefined names, we can load using `from_pretrained` + output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME) + else: + output_model_card_file = save_directory_or_file self.to_json_file(output_model_card_file) logger.info("Model card saved in {}".format(output_model_card_file)) @@ -139,8 +139,9 @@ class ModelCard(object): model_card_file = pretrained_model_name_or_path else: model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME) - # redirect to the cache, if necessary + try: + # Load from URL or cache if already cached resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies, resume_download=resume_download) if resolved_model_card_file == model_card_file: @@ -163,6 +164,7 @@ class ModelCard(object): ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), model_card_file, MODEL_CARD_NAME)) logger.warning("Creating an empty model card.") + # We fall back on creating an empty model card model_card = cls() @@ -171,6 +173,7 @@ class ModelCard(object): "model card file is not a valid JSON file. " "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file)) logger.warning("Creating an empty model card.") + # We fall back on creating an empty model card model_card = cls() From 9c391277cc380b1d1eba17fd7b3337c90b35987e Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 16 Dec 2019 15:19:13 +0100 Subject: [PATCH 162/302] Allow tensors placement on specific device through CLI and pipeline. --- transformers/commands/run.py | 3 +- transformers/pipelines.py | 77 ++++++++++++++++++++++++------------ 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index bcbb87391d..b4951b1bc2 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -16,7 +16,7 @@ def try_infer_format_from_ext(path: str): def run_command_factory(args): - nlp = pipeline(task=args.task, model=args.model, tokenizer=args.tokenizer) + nlp = pipeline(task=args.task, model=args.model, tokenizer=args.tokenizer, device=args.device) format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format reader = PipelineDataFormat.from_str(format, args.output, args.input, args.column) return RunCommand(nlp, reader) @@ -31,6 +31,7 @@ class RunCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): run_parser = parser.add_parser('run', help="Run a pipeline through the CLI") + run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU') run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run') run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.') run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. 
(default: same as the model name)') diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 7e2b30ba3c..5b0e81957b 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -18,6 +18,7 @@ import csv import json import os from abc import ABC, abstractmethod +from contextlib import contextmanager from itertools import groupby from typing import Union, Optional, Tuple, List, Dict @@ -152,11 +153,18 @@ class JsonPipelineDataFormat(PipelineDataFormat): class Pipeline(_ScikitCompat): - def __init__(self, model, tokenizer: PreTrainedTokenizer = None, args_parser: ArgumentHandler = None, **kwargs): + def __init__(self, model, tokenizer: PreTrainedTokenizer = None, + args_parser: ArgumentHandler = None, device: int = -1, **kwargs): + self.model = model self.tokenizer = tokenizer + self.device = device self._args_parser = args_parser or DefaultArgumentHandler() + # Special handling + if self.device >= 0 and not is_tf_available(): + self.model = self.model.to('cuda:{}'.format(self.device)) + def save_pretrained(self, save_directory): if not os.path.isdir(save_directory): logger.error("Provided path ({}) should be a directory".format(save_directory)) @@ -176,11 +184,25 @@ class Pipeline(_ScikitCompat): inputs = self._args_parser(*texts, **kwargs) # Encode for forward - inputs = self.tokenizer.batch_encode_plus( - inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' - ) + with self.device_placement(): + inputs = self.tokenizer.batch_encode_plus( + inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + ) - return self._forward(inputs) + return self._forward(inputs) + + @contextmanager + def device_placement(self): + if is_tf_available(): + import tensorflow as tf + with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)): + yield + else: + import torch + if self.device >= 0: + torch.cuda.set_device(self.device) + + yield def _forward(self, inputs): if is_tf_available(): @@ -225,14 +247,17 @@ class NerPipeline(Pipeline): for i, w in enumerate(words): tokens = self.tokenizer.tokenize(w) token_to_word += [i] * len(tokens) - tokens = self.tokenizer.encode_plus(sentence, return_attention_mask=False, return_tensors='tf' if is_tf_available() else 'pt') - # Forward - if is_torch_available(): - with torch.no_grad(): - entities = self.model(**tokens)[0][0].cpu().numpy() - else: - entities = self.model(tokens)[0][0].numpy() + # Manage correct placement of the tensors + with self.device_placement(): + tokens = self.tokenizer.encode_plus(sentence, return_attention_mask=False, return_tensors='tf' if is_tf_available() else 'pt') + + # Forward + if is_torch_available(): + with torch.no_grad(): + entities = self.model(**tokens)[0][0].cpu().numpy() + else: + entities = self.model(tokens)[0][0].numpy() # Normalize scores answer, token_start = [], 1 @@ -352,18 +377,20 @@ class QuestionAnsweringPipeline(Pipeline): features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) fw_args = self.inputs_for_model(features) - if is_tf_available(): - import tensorflow as tf - fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} - start, end = self.model(fw_args) - start, end = start.numpy(), end.numpy() - else: - import torch - with torch.no_grad(): - # Retrieve the score for the context tokens only (removing question tokens) - fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()} - start, end = 
self.model(**fw_args) - start, end = start.cpu().numpy(), end.cpu().numpy() + # Manage tensor allocation on correct device + with self.device_placement(): + if is_tf_available(): + import tensorflow as tf + fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} + start, end = self.model(fw_args) + start, end = start.numpy(), end.numpy() + else: + import torch + with torch.no_grad(): + # Retrieve the score for the context tokens only (removing question tokens) + fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()} + start, end = self.model(**fw_args) + start, end = start.cpu().numpy(), end.cpu().numpy() answers = [] for (example, feature, start_, end_) in zip(examples, features, start, end): @@ -374,7 +401,7 @@ class QuestionAnsweringPipeline(Pipeline): # Mask padding and question start_, end_ = start_ * np.abs(np.array(feature.p_mask) - 1), end_ * np.abs(np.array(feature.p_mask) - 1) - # TODO : What happend if not possible + # TODO : What happens if not possible # Mask CLS start_[0] = end_[0] = 0 From bbc707cf394776634bd433c895a9223fe9b256a9 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 16 Dec 2019 15:49:09 +0100 Subject: [PATCH 163/302] Fix non-keyworded varargs handling in DefaultArgumentHandler for pipeline. --- transformers/pipelines.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 5b0e81957b..ee551893b0 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -55,7 +55,12 @@ class DefaultArgumentHandler(ArgumentHandler): return kwargs['X'] elif 'data' in kwargs: return kwargs['data'] - elif len(args) > 0: + elif len(args) == 1: + if isinstance(args[0], list): + return args[0] + else: + return [args[0]] + elif len(args) > 1: return list(args) raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)') @@ -240,7 +245,7 @@ class NerPipeline(Pipeline): def __call__(self, *texts, **kwargs): inputs, answers = self._args_parser(*texts, **kwargs), [] - for sentence in texts: + for sentence in inputs: # Ugly token to word idx mapping (for now) token_to_word, words = [], sentence.split(' ') From 46ccbb42fc89876461995e0ba553d72cffa700ce Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 16 Dec 2019 15:49:41 +0100 Subject: [PATCH 164/302] Make CLI run command use integer mapping for device argument. --- transformers/commands/run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index b4951b1bc2..8c203699a8 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -31,7 +31,7 @@ class RunCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): run_parser = parser.add_parser('run', help="Run a pipeline through the CLI") - run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU') + run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run') run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.') run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. 
(default: same as the model name)') From 43a4e1bbe4091b11d7926b93250cc87fa75bd545 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 16 Dec 2019 16:00:41 +0100 Subject: [PATCH 165/302] Adressing issue in varargs handling for question answering. --- transformers/pipelines.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index ee551893b0..2a8f26b03e 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -293,8 +293,11 @@ class QuestionAnsweringPipeline(Pipeline): def __call__(self, *args, **kwargs): # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating - if args is not None and len(args) > 1: - kwargs['X'] = args + if args is not None and len(args) > 0: + if len(args) == 1: + kwargs['X'] = args[0] + else: + kwargs['X'] = list(args) # Generic compatibility with sklearn and Keras # Batched data From 71b47505175111dd391a5b9de9514fbe50558bf0 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 16:37:27 +0100 Subject: [PATCH 166/302] examples: add support for XLM-RoBERTa to run_ner script --- examples/run_ner.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/run_ner.py b/examples/run_ner.py index 1ab1236d94..6426a6d1db 100644 --- a/examples/run_ner.py +++ b/examples/run_ner.py @@ -38,11 +38,13 @@ from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, B from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer +from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer logger = logging.getLogger(__name__) ALL_MODELS = sum( - (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)), + (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig, + CamembertConfig, XLMRobertaConfig)), ()) MODEL_CLASSES = { @@ -50,6 +52,7 @@ MODEL_CLASSES = { "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer), "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer), "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer), + "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer), } From a096e2a88beae06c3341bc502b122d77be72571b Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Mon, 16 Dec 2019 16:38:02 +0100 Subject: [PATCH 167/302] WIP serving through HTTP internally using pipelines. 
--- transformers/commands/serving.py | 67 ++++++++++---------------------- 1 file changed, 21 insertions(+), 46 deletions(-) diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index 0b47246ead..a35dff0ebe 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -1,15 +1,15 @@ from argparse import ArgumentParser, Namespace from typing import List, Optional, Union, Any -import torch from fastapi import FastAPI, HTTPException, Body from logging import getLogger from pydantic import BaseModel from uvicorn import run -from transformers import AutoModel, AutoTokenizer, AutoConfig +from transformers import Pipeline from transformers.commands import BaseTransformersCLICommand +from transformers.pipelines import SUPPORTED_TASKS, pipeline def serve_command_factory(args: Namespace): @@ -17,7 +17,8 @@ def serve_command_factory(args: Namespace): Factory function used to instantiate serving server from provided command line arguments. :return: ServeCommand """ - return ServeCommand(args.host, args.port, args.model, args.graphql) + nlp = pipeline(args.task, args.model) + return ServeCommand(nlp, args.host, args.port, args.model, args.graphql) class ServeResult(BaseModel): @@ -53,8 +54,6 @@ class ServeForwardResult(ServeResult): """ Forward result model """ - tokens: List[str] - tokens_ids: List[int] output: Any @@ -68,19 +67,18 @@ class ServeCommand(BaseTransformersCLICommand): :return: """ serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.') + serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on') + serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.') serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.') serve_parser.add_argument('--model', type=str, required=True, help='Model\'s name or path to stored model to infer from.') serve_parser.add_argument('--graphql', action='store_true', default=False, help='Enable GraphQL endpoints.') serve_parser.set_defaults(func=serve_command_factory) - def __init__(self, host: str, port: int, model: str, graphql: bool): + def __init__(self, pipeline: Pipeline, host: str, port: int, model: str, graphql: bool): self._logger = getLogger('transformers-cli/serving') - self._logger.info('Loading model {}'.format(model)) - self._model_name = model - self._model = AutoModel.from_pretrained(model) - self._tokenizer = AutoTokenizer.from_pretrained(model) + self._pipeline = pipeline self._logger.info('Serving model over {}:{}'.format(host, port)) self._host = host @@ -97,7 +95,7 @@ class ServeCommand(BaseTransformersCLICommand): run(self._app, host=self._host, port=self._port) def model_info(self): - return ServeModelInfoResult(model=self._model_name, infos=vars(self._model.config)) + return ServeModelInfoResult(model='', infos=vars(self._pipeline.model.config)) def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)): """ @@ -106,16 +104,16 @@ class ServeCommand(BaseTransformersCLICommand): - **return_ids**: Boolean flags indicating if the tokens have to be converted to their integer mapping. 
""" try: - tokens_txt = self._tokenizer.tokenize(text_input) + tokens_txt = self._pipeline.tokenizer.tokenize(text_input) if return_ids: - tokens_ids = self._tokenizer.convert_tokens_to_ids(tokens_txt) - return ServeTokenizeResult(model=self._model_name, tokens=tokens_txt, tokens_ids=tokens_ids) + tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt) + return ServeTokenizeResult(model='', tokens=tokens_txt, tokens_ids=tokens_ids) else: - return ServeTokenizeResult(model=self._model_name, tokens=tokens_txt) + return ServeTokenizeResult(model='', tokens=tokens_txt) except Exception as e: - raise HTTPException(status_code=500, detail={"model": self._model_name, "error": str(e)}) + raise HTTPException(status_code=500, detail={"model": '', "error": str(e)}) def detokenize(self, tokens_ids: List[int] = Body(None, embed=True), skip_special_tokens: bool = Body(False, embed=True), @@ -127,14 +125,12 @@ class ServeCommand(BaseTransformersCLICommand): - **cleanup_tokenization_spaces**: Flag indicating to remove all leading/trailing spaces and intermediate ones. """ try: - decoded_str = self._tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) - return ServeDeTokenizeResult(model=self._model_name, text=decoded_str) + decoded_str = self._pipeline.tokenizer.decode(tokens_ids, skip_special_tokens, cleanup_tokenization_spaces) + return ServeDeTokenizeResult(model='', text=decoded_str) except Exception as e: - raise HTTPException(status_code=500, detail={"model": self._model_name, "error": str(e)}) + raise HTTPException(status_code=500, detail={"model": '', "error": str(e)}) - def forward(self, inputs: Union[str, List[str], List[int]] = Body(None, embed=True), - attention_mask: Optional[List[int]] = Body(None, embed=True), - tokens_type_ids: Optional[List[int]] = Body(None, embed=True)): + def forward(self, inputs: Union[str, dict, List[str], List[int], List[dict]] = Body(None, embed=True)): """ **inputs**: **attention_mask**: @@ -143,34 +139,13 @@ class ServeCommand(BaseTransformersCLICommand): # Check we don't have empty string if len(inputs) == 0: - return ServeForwardResult(model=self._model_name, output=[], attention=[]) - - if isinstance(inputs, str): - inputs_tokens = self._tokenizer.tokenize(inputs) - inputs_ids = self._tokenizer.convert_tokens_to_ids(inputs_tokens) - - elif isinstance(inputs, List): - if isinstance(inputs[0], str): - inputs_tokens = inputs - inputs_ids = self._tokenizer.convert_tokens_to_ids(inputs_tokens) - elif isinstance(inputs[0], int): - inputs_tokens = [] - inputs_ids = inputs - else: - error_msg = "inputs should be string, [str] of [int] (got {})".format(type(inputs[0])) - raise HTTPException(423, detail={"error": error_msg}) - else: - error_msg = "inputs should be string, [str] of [int] (got {})".format(type(inputs)) - raise HTTPException(423, detail={"error": error_msg}) + return ServeForwardResult(model='', output=[], attention=[]) try: # Forward through the model - t_input_ids = torch.tensor(inputs_ids).unsqueeze(0) - output = self._model(t_input_ids, attention_mask, tokens_type_ids) - + output = self._pipeline(inputs) return ServeForwardResult( - model=self._model_name, tokens=inputs_tokens, - tokens_ids=inputs_ids, output=output[0].tolist() + model='', output=output ) except Exception as e: raise HTTPException(500, {"error": str(e)}) From d3549b66af6f225cace48f8462ba715508f51b0d Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 16:38:39 +0100 Subject: [PATCH 168/302] module: add support for XLM-RoBERTa 
(__init__) --- transformers/__init__.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/transformers/__init__.py b/transformers/__init__.py index 740d2440c2..910ba91457 100644 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -49,6 +49,7 @@ from .tokenization_distilbert import DistilBertTokenizer from .tokenization_albert import AlbertTokenizer from .tokenization_camembert import CamembertTokenizer from .tokenization_t5 import T5Tokenizer +from .tokenization_xlm_roberta import XLMRobertaTokenizer # Configurations from .configuration_utils import PretrainedConfig @@ -65,6 +66,7 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CO from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP # Modeling if is_torch_available(): @@ -119,6 +121,9 @@ if is_torch_available(): AlbertForQuestionAnswering, load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP) + from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice, + XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification) + # Optimization from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup, get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup) From 9ed09cb4a31518b13f2c58c057e43e029c32611a Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 16:46:58 +0100 Subject: [PATCH 169/302] converter: add conversion script for original XLM-RoBERTa weights to Transformers-compatible weights --- ..._original_pytorch_checkpoint_to_pytorch.py | 184 ++++++++++++++++++ 1 file changed, 184 insertions(+) create mode 100644 transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000..888adf4819 --- /dev/null +++ b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,184 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Convert RoBERTa checkpoint.""" + +from __future__ import absolute_import, division, print_function + +import argparse +import logging +import numpy as np +import torch +import pathlib + +from fairseq.models.roberta import RobertaModel as FairseqRobertaModel +from fairseq.modules import TransformerSentenceEncoderLayer +from transformers.modeling_bert import (BertConfig, BertEncoder, + BertIntermediate, BertLayer, + BertModel, BertOutput, + BertSelfAttention, + BertSelfOutput) +from transformers.modeling_roberta import (RobertaEmbeddings, + RobertaForMaskedLM, + RobertaForSequenceClassification, + RobertaModel) + +logging.basicConfig(level=logging.INFO) +logger = logging.getLogger(__name__) + +SAMPLE_TEXT = 'Hello world! cécé herlolip' + + +def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head): + """ + Copy/paste/tweak roberta's weights to our BERT structure. + """ + roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece') + roberta.eval() # disable dropout + config = BertConfig( + vocab_size_or_config_json_file=250004, + hidden_size=roberta.args.encoder_embed_dim, + num_hidden_layers=roberta.args.encoder_layers, + num_attention_heads=roberta.args.encoder_attention_heads, + intermediate_size=roberta.args.encoder_ffn_embed_dim, + max_position_embeddings=514, + type_vocab_size=1, + layer_norm_eps=1e-5, # PyTorch default used in fairseq + ) + if classification_head: + config.num_labels = roberta.args.num_classes + print("Our BERT config:", config) + + model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) + model.eval() + + # Now let's copy all the weights. + # Embeddings + roberta_sent_encoder = roberta.model.decoder.sentence_encoder + model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight + model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight + model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. 
+ model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight + model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias + + for i in range(config.num_hidden_layers): + # Encoder: start of layer + layer: BertLayer = model.roberta.encoder.layer[i] + roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] + + ### self attention + self_attn: BertSelfAttention = layer.attention.self + assert( + roberta_layer.self_attn.k_proj.weight.data.shape == \ + roberta_layer.self_attn.q_proj.weight.data.shape == \ + roberta_layer.self_attn.v_proj.weight.data.shape == \ + torch.Size((config.hidden_size, config.hidden_size)) + ) + + self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight + self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias + self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight + self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias + self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight + self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias + + ### self-attention output + self_output: BertSelfOutput = layer.attention.output + assert( + self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape + ) + self_output.dense.weight = roberta_layer.self_attn.out_proj.weight + self_output.dense.bias = roberta_layer.self_attn.out_proj.bias + self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight + self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias + + ### intermediate + intermediate: BertIntermediate = layer.intermediate + assert( + intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape + ) + intermediate.dense.weight = roberta_layer.fc1.weight + intermediate.dense.bias = roberta_layer.fc1.bias + + ### output + bert_output: BertOutput = layer.output + assert( + bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape + ) + bert_output.dense.weight = roberta_layer.fc2.weight + bert_output.dense.bias = roberta_layer.fc2.bias + bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight + bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias + #### end of layer + + if classification_head: + model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight + model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias + model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight + model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias + else: + # LM Head + model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight + model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias + model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight + model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias + model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight + model.lm_head.bias = roberta.model.decoder.lm_head.bias + + # Let's check that we get the same results. 
+ input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 + + our_output = model(input_ids)[0] + if classification_head: + their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids)) + else: + their_output = roberta.model(input_ids)[0] + print(our_output.shape, their_output.shape) + max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() + print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 + success = torch.allclose(our_output, their_output, atol=1e-3) + print( + "Do both models output the same tensors?", + "🔥" if success else "💩" + ) + if not success: + raise Exception("Something went wRoNg") + + pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ## Required parameters + parser.add_argument("--roberta_checkpoint_path", + default = None, + type = str, + required = True, + help = "Path the official PyTorch dump.") + parser.add_argument("--pytorch_dump_folder_path", + default = None, + type = str, + required = True, + help = "Path to the output PyTorch model.") + parser.add_argument("--classification_head", + action = "store_true", + help = "Whether to convert a final classification head.") + args = parser.parse_args() + convert_roberta_checkpoint_to_pytorch( + args.roberta_checkpoint_path, + args.pytorch_dump_folder_path, + args.classification_head + ) From a648ff738c88a41cfae4f915a4391c7d66261b64 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 16:47:39 +0100 Subject: [PATCH 170/302] configuration: add support for XLM-RoBERTa model --- transformers/configuration_xlm_roberta.py | 33 +++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 transformers/configuration_xlm_roberta.py diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py new file mode 100644 index 0000000000..1633cc18aa --- /dev/null +++ b/transformers/configuration_xlm_roberta.py @@ -0,0 +1,33 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+""" XLM-RoBERTa configuration """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +from .configuration_roberta import RobertaConfig + +logger = logging.getLogger(__name__) + +XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { + 'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json", +} + + +class XLMRobertaConfig(RobertaConfig): + pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP From 69f4f058fa5ecc6fea8c65ae59694442bba795e6 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 17:00:12 +0100 Subject: [PATCH 171/302] model: add support for new XLM-RoBERTa model --- transformers/modeling_xlm_roberta.py | 293 +++++++++++++++++++++++++++ 1 file changed, 293 insertions(+) create mode 100644 transformers/modeling_xlm_roberta.py diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py new file mode 100644 index 0000000000..8402be4b5c --- /dev/null +++ b/transformers/modeling_xlm_roberta.py @@ -0,0 +1,293 @@ +# coding=utf-8 +# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch XLM-RoBERTa model. """ + +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging + +from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification +from .configuration_xlm_roberta import XLMRobertaConfig +from .file_utils import add_start_docstrings + +logger = logging.getLogger(__name__) + +XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-pytorch_model.bin", +} + + +XLM_ROBERTA_START_DOCSTRING = r""" The XLM-RoBERTa model was proposed in + `Unsupervised Cross-lingual Representation Learning at Scale`_ + by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019. + + It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data. + + This implementation is the same as RoBERTa. + + This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and + refer to the PyTorch documentation for all matter related to general usage and behavior. + + .. _`Unsupervised Cross-lingual Representation Learning at Scale`: + https://arxiv.org/abs/1911.02116 + + .. _`torch.nn.Module`: + https://pytorch.org/docs/stable/nn.html#module + + Parameters: + config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the configuration. 
+ Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights. +""" + +XLM_ROBERTA_INPUTS_DOCSTRING = r""" + Inputs: + **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of input sequence tokens in the vocabulary. + To match pre-training, XLM-RoBERTa input sequence should be formatted with and tokens as follows: + + (a) For sequence pairs: + + ``tokens: Is this Jacksonville ? No it is not . `` + + (b) For single sequences: + + ``tokens: the dog is hairy . `` + + Fully encoded sequences or sequence pairs can be obtained using the XLMRobertaTokenizer.encode function with + the ``add_special_tokens`` parameter set to ``True``. + + XLM-RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on + the right rather than the left. + + See :func:`transformers.PreTrainedTokenizer.encode` and + :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details. + **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``: + Mask to avoid performing attention on padding token indices. + Mask values selected in ``[0, 1]``: + ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens. + **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Optional segment token indices to indicate first and second portions of the inputs. + This embedding matrice is not trained (not pretrained during XLM-RoBERTa pretraining), you will have to train it + during finetuning. + Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1`` + corresponds to a `sentence B` token + (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details). + **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Indices of positions of each input sequence tokens in the position embeddings. + Selected in the range ``[0, config.max_position_embeddings - 1[``. + **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``: + Mask to nullify selected heads of the self-attention modules. + Mask values selected in ``[0, 1]``: + ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**. + **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``: + Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. +""" + +@add_start_docstrings("The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.", + XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) +class XLMRobertaModel(RobertaModel): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)`` + Sequence of hidden-states at the output of the last layer of the model. + **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)`` + Last layer hidden-state of the first token of the sequence (classification token) + further processed by a Linear layer and a Tanh activation function. 
The Linear + layer weights are trained from the next sentence prediction (classification) + eo match pre-training, XLM-RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows: + + (a) For sequence pairs: + + ``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` + + (b) For single sequences: + + ``tokens: [CLS] the dog is hairy . [SEP]`` + + ``token_type_ids: 0 0 0 0 0 0 0`` + + objective during Bert pretraining. This output is usually *not* a good summary + of the semantic content of the input, you're often better with averaging or pooling + the sequence of hidden-states for the whole input sequence. + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large') + model = XLMRobertaModel.from_pretrained('xlm-roberta-large') + input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids) + last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple + + """ + config_class = XLMRobertaConfig + pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + + +@add_start_docstrings("""XLM-RoBERTa Model with a `language modeling` head on top. """, + XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) +class XLMRobertaForMaskedLM(RobertaForMaskedLM): + r""" + **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the masked language modeling loss. + Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring) + Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels + in ``[0, ..., config.vocab_size]`` + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Masked language modeling loss. + **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)`` + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. 
+ **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large') + model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-large') + input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, masked_lm_labels=input_ids) + loss, prediction_scores = outputs[:2] + + """ + config_class = XLMRobertaConfig + pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + + +@add_start_docstrings("""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer + on top of the pooled output) e.g. for GLUE tasks. """, + XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) +class XLMRobertaForSequenceClassification(RobertaForSequenceClassification): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``: + Labels for computing the sequence classification/regression loss. + Indices should be in ``[0, ..., config.num_labels]``. + If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss), + If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy). + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification (or regression if config.num_labels==1) loss. + **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)`` + Classification (or regression if config.num_labels==1) scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large') + model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large') + input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1]).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, logits = outputs[:2] + + """ + config_class = XLMRobertaConfig + pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + + +@add_start_docstrings("""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of + the pooled output and a softmax) e.g. for RocStories/SWAG tasks. 
""", + XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) +class XLMRobertaForMultipleChoice(RobertaForMultipleChoice): + r""" + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension + of the input tensors. (see `input_ids` above). + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. + + Examples:: + + tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large') + model = XLMRobertaForMultipleChoice.from_pretrained('xlm-roberta-large') + choices = ["Schloß Nymphenburg ist sehr schön .", "Der Schloßkanal auch !"] + input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0) # Batch size 1, 2 choices + labels = torch.tensor(1).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, classification_scores = outputs[:2] + + """ + config_class = XLMRobertaConfig + pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP + + +@add_start_docstrings("""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of + the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """, + XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING) +class XLMRobertaForTokenClassification(RobertaForTokenClassification): + r""" + **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``: + Labels for computing the token classification loss. + Indices should be in ``[0, ..., config.num_labels - 1]``. + + Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs: + **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``: + Classification loss. + **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)`` + Classification scores (before SoftMax). + **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``) + list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings) + of shape ``(batch_size, sequence_length, hidden_size)``: + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + **attentions**: (`optional`, returned when ``config.output_attentions=True``) + list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``: + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. 
+ + Examples:: + + tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large') + model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-large') + input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .", add_special_tokens=True)).unsqueeze(0) # Batch size 1 + labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0) # Batch size 1 + outputs = model(input_ids, labels=labels) + loss, scores = outputs[:2] + + """ + config_class = XLMRobertaConfig + pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP From 59a1aefb1ca51b183bffa2d355bc2a22a7c51274 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 17:00:55 +0100 Subject: [PATCH 172/302] tokenization: add support for new XLM-RoBERTa model. Add wrapper around fairseq tokenization logic --- transformers/tokenization_xlm_roberta.py | 154 +++++++++++++++++++++++ 1 file changed, 154 insertions(+) create mode 100644 transformers/tokenization_xlm_roberta.py diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py new file mode 100644 index 0000000000..0f95397606 --- /dev/null +++ b/transformers/tokenization_xlm_roberta.py @@ -0,0 +1,154 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +""" Tokenization classes for XLM-RoBERTa model.""" +from __future__ import (absolute_import, division, print_function, + unicode_literals) + +import logging +import os +from shutil import copyfile + +import sentencepiece as spm +from transformers.tokenization_utils import PreTrainedTokenizer + +logger = logging.getLogger(__name__) + +VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} + +PRETRAINED_VOCAB_FILES_MAP = { + 'vocab_file': + { + 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-sentencepiece.bpe.model", + } +} + +PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'xlm-roberta-large': None, +} + +class XLMRobertaTokenizer(PreTrainedTokenizer): + """ + Adapted from RobertaTokenizer and XLNetTokenizer + SentencePiece based tokenizer. 
Peculiarities: + + - requires `SentencePiece `_ + """ + vocab_files_names = VOCAB_FILES_NAMES + pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP + max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + + def __init__(self, vocab_file, bos_token="", eos_token="", sep_token="", + cls_token="", unk_token="", pad_token='', mask_token='', + **kwargs): + super(XLMRobertaTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, + sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, + mask_token=mask_token, + **kwargs) + self.max_len_single_sentence = self.max_len - 2 # take into account special tokens + self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens + self.sp_model = spm.SentencePieceProcessor() + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2} + self.fairseq_tokens_to_ids[''] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks + by concatenating and adding special tokens. + A RoBERTa sequence has the following format: + single sequence: X + pair of sequences: A B + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods. + + Args: + token_ids_0: list of ids (must not contain special tokens) + token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids + for sequence pairs + already_has_special_tokens: (default False) Set to True if the token list is already formated with + special tokens for the model + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError("You should not supply a second sequence if the provided sequence of " + "ids is already formated with special tokens for the model.") + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. + A RoBERTa sequence pair mask has the following format: + 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence + + if token_ids_1 is None, only returns the first portion of the mask (0's). 
+ """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1] + + @property + def vocab_size(self): + return len(self.sp_model) + len(self.fairseq_tokens_to_ids) + + def _tokenize(self, text): + return self.sp_model.EncodeAsPieces(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str/unicode) in an id using the vocab. """ + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + return self.sp_model.PieceToId(token) + 1 + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (string/unicode) using the vocab.""" + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index + 1) + + def save_vocabulary(self, save_directory): + """ Save the sentencepiece vocabulary (copy original file) and special tokens file + to a directory. + """ + if not os.path.isdir(save_directory): + logger.error("Vocabulary path ({}) should be a directory".format(save_directory)) + return + out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file']) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) From a701a0cee1ae6cb7b93b047cc3ffc06b01157955 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 17:17:56 +0100 Subject: [PATCH 173/302] configuration: fix model name for large XLM-RoBERTa model --- transformers/configuration_xlm_roberta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py index 1633cc18aa..dd03572976 100644 --- a/transformers/configuration_xlm_roberta.py +++ b/transformers/configuration_xlm_roberta.py @@ -25,7 +25,7 @@ from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json", + 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json", } From d064009b72c4a29cd66b6c633dcd8c3ad5ab6dca Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Mon, 16 Dec 2019 17:23:25 +0100 Subject: [PATCH 174/302] converter: fix vocab size --- ...onvert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py index 888adf4819..6873f1d0f0 100644 --- a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -47,7 +47,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece') roberta.eval() # disable dropout config = BertConfig( - vocab_size_or_config_json_file=250004, + vocab_size_or_config_json_file=250002, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, num_attention_heads=roberta.args.encoder_attention_heads, From 855ff0e91d8b3bd75a3b1c1316e2efd814373764 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 16 Dec 2019 12:42:22 
-0500 Subject: [PATCH 175/302] [doc] Model upload and sharing ping @lysandrejik @thomwolf Is this clear enough? Anything we should add? --- README.md | 41 ++++++++++++++++++++++++++++++++++++ docs/source/index.rst | 1 + docs/source/model_sharing.md | 40 +++++++++++++++++++++++++++++++++++ 3 files changed, 82 insertions(+) create mode 100644 docs/source/model_sharing.md diff --git a/README.md b/README.md index 214f61cc0c..a5ae74a9ae 100644 --- a/README.md +++ b/README.md @@ -56,6 +56,7 @@ Choose the right framework for every part of a model's lifetime | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 | | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch | | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | +| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community | | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers | | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers | | [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more | @@ -446,6 +447,46 @@ python ./examples/run_generation.py \ --repetition_penalty=1.2 \ ``` +## Quick tour of model sharing + +New in `v2.2.2`: you can now upload and share your fine-tuned models with the community, using the CLI that's built-in to the library. + +**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then: + +```shell +transformers-cli login +# log in using the same credentials as on huggingface.co +``` +Upload your model: +```shell +transformers-cli upload ./path/to/pretrained_model/ + +# ^^ Upload folder containing weights/tokenizer/config +# saved via `.save_pretrained()` + +transformers-cli upload ./config.json [--filename foobar.json] + +# ^^ Upload a single file +# (you can optionally override its filename) +``` + +Your model will then be accessible through its identifier: +```python +"username/model_name" +``` + +Anyone can load it from code: +```python +tokenizer = AutoTokenizer.from_pretrained("username/model_name") +model = AutoModel.from_pretrained("username/model_name") +``` + +Finally, list all your files on S3: +```shell +transformers-cli ls +# List all your S3 objects. +``` + ## Migrating from pytorch-transformers to transformers Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`. 
diff --git a/docs/source/index.rst b/docs/source/index.rst index 84012fc6cf..48282c1c6c 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -58,6 +58,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train installation quickstart pretrained_models + model_sharing examples notebooks serialization diff --git a/docs/source/model_sharing.md b/docs/source/model_sharing.md new file mode 100644 index 0000000000..b9c722b10f --- /dev/null +++ b/docs/source/model_sharing.md @@ -0,0 +1,40 @@ +# Model upload and sharing + +Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the CLI that's built-in to the library. + +**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then: + +```shell +transformers-cli login +# log in using the same credentials as on huggingface.co +``` +Upload your model: +```shell +transformers-cli upload ./path/to/pretrained_model/ + +# ^^ Upload folder containing weights/tokenizer/config +# saved via `.save_pretrained()` + +transformers-cli upload ./config.json [--filename foobar.json] + +# ^^ Upload a single file +# (you can optionally override its filename) +``` + +Your model will then be accessible through its identifier: +```python +"username/model_name" +``` + +Anyone can load it from code: +```python +tokenizer = AutoTokenizer.from_pretrained("username/model_name") +model = AutoModel.from_pretrained("username/model_name") +``` + +Finally, list all your files on S3: +```shell +transformers-cli ls +# List all your S3 objects. +``` + From a468870fd27c601e3717c5b9ca691e18a8c7227f Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 22:22:30 +0100 Subject: [PATCH 176/302] refactoring generation --- transformers/configuration_utils.py | 11 + transformers/modeling_utils.py | 429 +++++++++++++--------------- 2 files changed, 213 insertions(+), 227 deletions(-) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 08cee75d81..9c3360892d 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -57,8 +57,19 @@ class PretrainedConfig(object): self.torchscript = kwargs.pop('torchscript', False) # Only used by PyTorch models self.use_bfloat16 = kwargs.pop('use_bfloat16', False) self.pruned_heads = kwargs.pop('pruned_heads', {}) + + # Is decoder is used in encoder-decoder models to differentiate encoder from decoder self.is_decoder = kwargs.pop('is_decoder', False) + # Parameters for sequence generation + self.generate_length = kwargs.pop('generate_length', 10) + self.generate_do_sample = kwargs.pop('generate_do_sample', False) + self.generate_num_beams = kwargs.pop('generate_num_beams', 1) + self.generate_temperature = kwargs.pop('generate_temperature', 1.0) + self.generate_top_k = kwargs.pop('generate_top_k', 50) + self.generate_top_p = kwargs.pop('generate_top_p', 0.0) + self.generate_repetition_penalty = kwargs.pop('generate_repetition_penalty', 1.0) + def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method. 
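A minimal sketch, not part of the patch series, of how the generation defaults added to `PretrainedConfig` in PATCH 176 behave: they are popped from `**kwargs` in `__init__`, so any concrete config can override them at construction time. The `GPT2Config` class and the values below are illustrative assumptions.

```python
# Illustrative sketch only; assumes a v2.2-era checkout with PATCH 176 applied.
from transformers import GPT2Config

# Overridden settings are popped from kwargs; the rest fall back to the defaults above.
config = GPT2Config(generate_length=40, generate_do_sample=True, generate_top_k=40)

print(config.generate_length)     # 40  (overridden at construction time)
print(config.generate_top_p)      # 0.0 (default introduced by this patch)
print(config.generate_num_beams)  # 1   (default introduced by this patch)
```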
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 74038351fd..27d42c552a 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -82,6 +82,7 @@ class PreTrainedModel(nn.Module): "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ )) + # Save config in model self.config = config @@ -89,93 +90,6 @@ class PreTrainedModel(nn.Module): def base_model(self): return getattr(self, self.base_model_prefix, self) - def decode(self, - prompt_ids=None, - device=torch.device('cpu'), - length=10, - do_sample=False, - temperature=1., - k=9, - p=0, - repetition_penalty=1, - **model_kwargs): - """ Generic sequence generator for single-stack models with a LM head. - - The method currently supports greedy decoding and sampling. See the - documentation of the `Sampler` class for more information about the - parameters related to sampling. - - Params: - **encoder_input_ids**: `torch.LongTensor` of shape (1, sequence_length) - The sequence to encode. - **decoder_prompt_ids**: (`optional`) `torch.LongTensor` of shape (1, sequence_length) - The sequence used as a prompt for the generation. If `None` the method initializes - it as an empty `torch.LongTensor` of shape (1,) - **device**: (`optional`) `torch.device` - The device on which the prompt_ids will be initialized if not provided. - **length**: (`optional`) int - The length of the sequence to be generated. - **do_sample**: (`optional`) bool - If set to `False` we use greedy decoding; otherwise sampling. - **temperature**: (`optional`) float - The value used to module the next token probabilities. - **k**: (`optional`) int - The parameter used for k-filtering. - **p**: (`optional`) float - The parameter for nucleus sampling. Must be between 0 and 1. - **repetition_penalty**: (`optional`) float - The parameter for repetition penalty. - """ - - if prompt_ids is None: - prompt_ids = torch.tensor([[]], dtype=torch.long, device=device) - - # When the model does not have a LM head `get_output_embeddings` - # returns `None`. We use this mechanism to determine whether we - # should proceed with decoding or not. - if self.get_output_embeddings() is None: - raise AttributeError("You tried do generated sequences with a model that does not have a LM Head.") - - # The followings checks that the model is on the same device as the one - # that is specified. It only works for models that fit on one GPU. - model_device = next(self.parameters()).device - if model_device != prompt_ids.device: - warnings.warn( - "The model is not on the same device as the prompts. 
Expected {}, got {}.".format( - prompt_ids.device, model_device - ) - ) - - sampler_config = { - "k": k, - "p": p, - "do_sample": do_sample, - "temperature": temperature, - "repetition_penalty": repetition_penalty, - } - return self._greedy_decode_or_sample(prompt_ids, length, sampler_config, **model_kwargs) - - def _greedy_decode_or_sample(self, prompt_ids, length, sampler_config, **model_kwargs): - """ Generate text using greedy decoding or by sampling tokens.""" - sampler = Sampler(**sampler_config) - generated_sequence = prompt_ids - with torch.no_grad(): - for _ in trange(length): - arguments = self._prepare_inputs_for_decoding(generated_sequence, **model_kwargs) - outputs = self(**arguments) - next_tokens_logits = outputs[0][:, -1, :] - next_tokens = sampler.get_one_token( - next_tokens_logits, generated_sequence - ) - generated_sequence = torch.cat((generated_sequence, next_tokens), dim=1) - - return generated_sequence.squeeze(0) - - def _prepare_inputs_for_decoding(self, input_ids, **kwargs): - arguments = {"input_ids": input_ids} - arguments.update(kwargs) - return arguments - def get_input_embeddings(self): """ Get model's input embeddings """ @@ -306,6 +220,9 @@ class PreTrainedModel(nn.Module): # Tie weights if needed self.tie_weights() + # Initialize decoding head if we have output embeddings + + def prune_heads(self, heads_to_prune): """ Prunes heads of the base model. @@ -571,6 +488,204 @@ class PreTrainedModel(nn.Module): return model + def generate(self, input_ids=None, length=None, do_sample=False, num_beams=None, + temperature=None, top_k=None, top_p=None, repetition_penalty=None, + **model_kwargs): + """ Generic sequence generator for single-stack models with a LM head. + + The method currently supports greedy decoding and sampling. See the + documentation of the `Sampler` class for more information about the + parameters related to sampling. + + Params: + **input_ids**: (`optional`) `torch.LongTensor` of shape (1, sequence_length) + The sequence used as a prompt for the generation. If `None` the method initializes + it as an empty `torch.LongTensor` of shape (1,) + **length**: (`optional`) int + The length of the sequence to be generated. + **do_sample**: (`optional`) bool + If set to `False` we use greedy decoding; otherwise sampling. + **temperature**: (`optional`) float + The value used to module the next token probabilities. + **k**: (`optional`) int + The parameter used for k-filtering. + **p**: (`optional`) float + The parameter for nucleus sampling. Must be between 0 and 1. + **repetition_penalty**: (`optional`) float + The parameter for repetition penalty. 
+ """ + + if input_ids is None: + input_ids = torch.tensor([[]], dtype=torch.long, device=next(self.parameters()).device) + + # We cannot generate if the model does not have a LM head + if self.get_output_embeddings() is None: + raise AttributeError("You tried do generated sequences with a model that does not have a LM Head.") + + sampler_config = { + "k": k, + "p": p, + "do_sample": do_sample, + "temperature": temperature, + "repetition_penalty": repetition_penalty, + } + + sampler = Sampler(**sampler_config) + generated_sequence = input_ids + for _ in trange(length): + arguments = self._prepare_inputs_for_decoding(generated_sequence, **model_kwargs) + outputs = self(**arguments) + next_tokens_logits = outputs[0][:, -1, :] + next_tokens = sampler.get_one_token( + next_tokens_logits, generated_sequence + ) + generated_sequence = torch.cat((generated_sequence, next_tokens), dim=1) + + return generated_sequence.squeeze(0) + + def _prepare_inputs_for_decoding(self, input_ids, **model_kwargs): + return model_kwargs.update({"input_ids": input_ids}) + + +class Sampler(object): + r""" Sampler is used to generate sequences of ids from logit inputs. + + Greedy decoding, which consists in chosing the most probable token at each + step, is the default behaviour. Sampling with varying temperature, top_k + and nucleus filtering is also implemented. + + Attributes: + **device**: ``torch.device`` + Device on which the computations will be run. + **do_sample**: bool + Whether to sample or do greedy decoding. + **k**: int between 0 and vocab_size + Parameter for the top-k filtering + **p**: float between 0 and 1 + Parameter for the nucleus filtering + **temperature**: strictly positive float + Parameter used to modulate the distribution over ids. Low temperatures + put more emphasis on highly probably token while high temperatures tend + to smooth the probability distribution. + **repetition_penalty**: strictly postitive float + The penalty applied to repeating ids + """ + + def __init__( + self, do_sample=False, k=9, p=0.0, temperature=1.0, repetition_penalty=1.0 + ): + self.k = k + self.p = p + self.do_sample = do_sample + self.temperature = temperature + self.repetition_penalty = repetition_penalty + + self.do_apply_repetition_penalty = True if repetition_penalty > 1 else False + + if self.p > 1: + warnings.warn( + """You are trying to apply nucleus filtering with a value of p greater than 1 ({}). + However p is a probability and its value must lie between 0 and 1. In effect, no filtering + will be applied. If this is not the behavior you expect, change the value of p.""".format( + self.p + ) + ) + + def get_one_token(self, next_token_logits, past_sequence): + logits = self.apply_repetition_penalty(next_token_logits, past_sequence) + if self.do_sample: + logits = self.apply_temperature(logits) + logits = self.apply_top_k_filter(logits) + logits = self.apply_nucleus_filter(logits) + return torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) + return torch.argmax(logits, dim=-1).unsqueeze(-1) + + def apply_repetition_penalty(self, logits, past_sequence): + """ Apply a penalty to tokens that appear more than once in the + generated sequence. + + .. Keskar, Nitish Shirish, et al. "Ctrl: A conditional transformer + language model for controllable generation." arXiv preprint + arXiv:1909.05858 (2019). 
+ """ + if self.do_apply_repetition_penalty: + generated_token_idx = set(past_sequence[0].tolist()) + for token_idx in generated_token_idx: + logits[0, token_idx] /= self.repetition_penalty + return logits + + def apply_temperature(self, logits): + """ Shape the tokens' distribution through temperature. The higher the value + of the temperature, the more skewed towards high probability events the + distribution is. + + .. Goodfellow, Ian, Yoshua Bengio, and Aaron Courville. Deep learning. + MIT press, 2016. + """ + # when dividing a float by 0, torch returns inf which in turns breaks the + # multinomial with an error message that is not very helpful. It is better + # for the user to break the execution and explain why. + if self.temperature == 0: + raise ZeroDivisionError( + """You are trying to sample with a temperature equal to 0. + If you wanted to do greedy sampling, set instead `do_sample` to False. + Otherwise set the temperature to a value different from 0.""" + ) + return logits / self.temperature + + def apply_top_k_filter(self, logits): + """ Use the probability distribution of the tokens to determine the set + to be sampled from. Specifically we select the set of size k such that + the sum of its items' probabilities is maximum. + + .. Fan, Angela, Mike Lewis, and Yann Dauphin. "Hierarchical neural + story generation." arXiv preprint arXiv:1805.04833 (2018). + """ + if self.k > 0: + vocabulary_size = logits.size(-1) + if self.k > vocabulary_size: + warnings.warn( + """You provided a value for k ({}) that is larger than the vocabulary size ({}). + We adjusted k's value to the vocabulary size; if that was what you intended to do + we recommend setting k to 0 instead. It this is not the behavior you expected, + choose a value of k that is smaller than the vocabulary size.""".format( + self.k, vocabulary_size + ) + ) + self.k = vocabulary_size + + indices_to_remove = logits < torch.topk(logits, self.k)[0][..., -1, None] + logits[indices_to_remove] = -float("Inf") + + return logits + + def apply_nucleus_filter(self, logits): + """ Use the probability distribution of the tokens to determine the set + to be sampled from. Specifically, choose the smallest set such that the + sum of its items' probabilities is greater than a number p in [0,1]. + + .. Holtzman, Ari, et al. "The curious case of neural text + degeneration." arXiv preprint arXiv:1904.09751 (2019). + """ + if self.p > 0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + sorted_probabilities = F.softmax(sorted_logits, dim=-1) + cumulative_probabilities = torch.cumsum(sorted_probabilities, dim=-1) + + # Remove tokens with cumulative probability above the threshold, + # but keep the first token above the threshold. + sorted_indices_to_remove = cumulative_probabilities > self.p + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter( + dim=-1, index=sorted_indices, src=sorted_indices_to_remove + ) + logits[indices_to_remove] = -float("Inf") + + return logits + class Conv1D(nn.Module): def __init__(self, nf, nx): @@ -948,143 +1063,3 @@ def prune_layer(layer, index, dim=None): return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) else: raise ValueError("Can't prune layer of class {}".format(layer.__class__)) - - -class Sampler(object): - r""" Sampler is used to generate sequences of ids from logit inputs. 
- - Greedy decoding, which consists in chosing the most probable token at each - step, is the default behaviour. Sampling with varying temperature, top_k - and nucleus filtering is also implemented. - - Attributes: - **device**: ``torch.device`` - Device on which the computations will be run. - **do_sample**: bool - Whether to sample or do greedy decoding. - **k**: int between 0 and vocab_size - Parameter for the top-k filtering - **p**: float between 0 and 1 - Parameter for the nucleus filtering - **temperature**: strictly positive float - Parameter used to modulate the distribution over ids. Low temperatures - put more emphasis on highly probably token while high temperatures tend - to smooth the probability distribution. - **repetition_penalty**: strictly postitive float - The penalty applied to repeating ids - """ - - def __init__( - self, do_sample=False, k=9, p=0.0, temperature=1.0, repetition_penalty=1.0 - ): - self.k = k - self.p = p - self.do_sample = do_sample - self.temperature = temperature - self.repetition_penalty = repetition_penalty - - self.do_apply_repetition_penalty = True if repetition_penalty > 1 else False - - if self.p > 1: - warnings.warn( - """You are trying to apply nucleus filtering with a value of p greater than 1 ({}). - However p is a probability and its value must lie between 0 and 1. In effect, no filtering - will be applied. If this is not the behavior you expect, change the value of p.""".format( - self.p - ) - ) - - def get_one_token(self, next_token_logits, past_sequence): - logits = self.apply_repetition_penalty(next_token_logits, past_sequence) - if self.do_sample: - logits = self.apply_temperature(logits) - logits = self.apply_top_k_filter(logits) - logits = self.apply_nucleus_filter(logits) - return torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) - return torch.argmax(logits, dim=-1).unsqueeze(-1) - - def apply_repetition_penalty(self, logits, past_sequence): - """ Apply a penalty to tokens that appear more than once in the - generated sequence. - - .. Keskar, Nitish Shirish, et al. "Ctrl: A conditional transformer - language model for controllable generation." arXiv preprint - arXiv:1909.05858 (2019). - """ - if self.do_apply_repetition_penalty: - generated_token_idx = set(past_sequence[0].tolist()) - for token_idx in generated_token_idx: - logits[0, token_idx] /= self.repetition_penalty - return logits - - def apply_temperature(self, logits): - """ Shape the tokens' distribution through temperature. The higher the value - of the temperature, the more skewed towards high probability events the - distribution is. - - .. Goodfellow, Ian, Yoshua Bengio, and Aaron Courville. Deep learning. - MIT press, 2016. - """ - # when dividing a float by 0, torch returns inf which in turns breaks the - # multinomial with an error message that is not very helpful. It is better - # for the user to break the execution and explain why. - if self.temperature == 0: - raise ZeroDivisionError( - """You are trying to sample with a temperature equal to 0. - If you wanted to do greedy sampling, set instead `do_sample` to False. - Otherwise set the temperature to a value different from 0.""" - ) - return logits / self.temperature - - def apply_top_k_filter(self, logits): - """ Use the probability distribution of the tokens to determine the set - to be sampled from. Specifically we select the set of size k such that - the sum of its items' probabilities is maximum. - - .. Fan, Angela, Mike Lewis, and Yann Dauphin. "Hierarchical neural - story generation." 
arXiv preprint arXiv:1805.04833 (2018). - """ - if self.k > 0: - vocabulary_size = logits.size(-1) - if self.k > vocabulary_size: - warnings.warn( - """You provided a value for k ({}) that is larger than the vocabulary size ({}). - We adjusted k's value to the vocabulary size; if that was what you intended to do - we recommend setting k to 0 instead. It this is not the behavior you expected, - choose a value of k that is smaller than the vocabulary size.""".format( - self.k, vocabulary_size - ) - ) - self.k = vocabulary_size - - indices_to_remove = logits < torch.topk(logits, self.k)[0][..., -1, None] - logits[indices_to_remove] = -float("Inf") - - return logits - - def apply_nucleus_filter(self, logits): - """ Use the probability distribution of the tokens to determine the set - to be sampled from. Specifically, choose the smallest set such that the - sum of its items' probabilities is greater than a number p in [0,1]. - - .. Holtzman, Ari, et al. "The curious case of neural text - degeneration." arXiv preprint arXiv:1904.09751 (2019). - """ - if self.p > 0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - sorted_probabilities = F.softmax(sorted_logits, dim=-1) - cumulative_probabilities = torch.cumsum(sorted_probabilities, dim=-1) - - # Remove tokens with cumulative probability above the threshold, - # but keep the first token above the threshold. - sorted_indices_to_remove = cumulative_probabilities > self.p - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter( - dim=-1, index=sorted_indices, src=sorted_indices_to_remove - ) - logits[indices_to_remove] = -float("Inf") - - return logits From d8034092153a6850052862f154a398b88b8ba4e5 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 16 Dec 2019 16:31:38 -0500 Subject: [PATCH 177/302] Fix run squad evaluate during training --- examples/run_squad.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index a39915ee8b..34c31c3bb8 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -223,7 +223,7 @@ def evaluate(args, model, tokenizer, prefix=""): eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size) # multi-gpu evaluate - if args.n_gpu > 1: + if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): model = torch.nn.DataParallel(model) # Eval! 
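As a reading aid, a minimal sketch, not part of the patch series, of the `Sampler` class that PATCH 176 above moves next to `PreTrainedModel.generate`; the import path follows the patch, while the vocabulary size and token ids are assumptions.

```python
# Illustrative sketch only; assumes a checkout with PATCH 176 applied, where Sampler
# is defined in transformers.modeling_utils. Shapes follow the patch: logits are
# (batch_size, vocab_size), past_sequence is (batch_size, current_length).
import torch
from transformers.modeling_utils import Sampler

sampler = Sampler(do_sample=True, k=10, p=0.9, temperature=0.7, repetition_penalty=1.2)

past_sequence = torch.tensor([[50, 428, 428]])  # previously generated ids, batch of 1
next_token_logits = torch.randn(1, 50257)       # assumed GPT-2-sized vocabulary

# Repetition penalty, temperature, top-k and nucleus filtering are applied in turn,
# then a single token id is drawn from the filtered distribution.
next_token = sampler.get_one_token(next_token_logits, past_sequence)
sequence = torch.cat((past_sequence, next_token), dim=1)
print(sequence.shape)  # torch.Size([1, 4])
```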
From 18a879f47576822aa1a5c49aecb27d89bfa5fa69 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Mon, 16 Dec 2019 16:44:29 -0500 Subject: [PATCH 178/302] fix #2180 --- examples/run_generation.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/examples/run_generation.py b/examples/run_generation.py index 2d917660cf..fa52905b7e 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -247,7 +247,11 @@ def main(): out = out[:, len(context_tokens):].tolist() for o in out: text = tokenizer.decode(o, clean_up_tokenization_spaces=True) - text = text[: text.find(args.stop_token) if args.stop_token else None] + if args.stop_token: + index = text.find(args.stop_token) + if index == -1: + index = None + text = text[:index] print(text) From 3cb51299c371f67b4da40b89c59c63e9405591f0 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Mon, 16 Dec 2019 22:32:05 +0100 Subject: [PATCH 179/302] Fix #2109 --- transformers/modeling_tf_pytorch_utils.py | 13 +++++++++++-- transformers/modeling_tf_utils.py | 2 +- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index 9d2b663dcb..d885fd23b3 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -143,7 +143,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove) # Find associated numpy array in pytorch model state dict - assert name in pt_state_dict, "{} not found in PyTorch model".format(name) + if name not in pt_state_dict: + if allow_missing_keys: + continue + raise AttributeError("{} not found in PyTorch model".format(name)) + array = pt_state_dict[name].numpy() if transpose: @@ -250,6 +254,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F all_tf_weights = set(list(tf_weights_map.keys())) loaded_pt_weights_data_ptr = {} + missing_keys_pt = [] for pt_weight_name, pt_weight in current_pt_params_dict.items(): # Handle PyTorch shared weight ()not duplicated in TF 2.0 if pt_weight.data_ptr() in loaded_pt_weights_data_ptr: @@ -258,7 +263,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F # Find associated numpy array in pytorch model state dict if pt_weight_name not in tf_weights_map: - raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name)) + if allow_missing_keys: + missing_keys_pt.append(pt_weight_name) + continue + raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name)) array, transpose = tf_weights_map[pt_weight_name] @@ -283,6 +291,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F all_tf_weights.discard(pt_weight_name) missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False) + missing_keys += missing_keys_pt if len(missing_keys) > 0: logger.info("Weights of {} not initialized from TF 2.0 model: {}".format( diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 6fb4850b05..6bbec71cdf 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -297,7 +297,7 @@ class TFPreTrainedModel(tf.keras.Model): if from_pt: # Load from a PyTorch checkpoint - return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file) + return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, 
allow_missing_keys=True) ret = model(model.dummy_inputs, training=False) # build the network with dummy inputs From 3f5ccb183e3cfa755dea2dd2afd9abbf1a0f93b8 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 16 Dec 2019 18:20:23 -0500 Subject: [PATCH 180/302] [doc] Clarify uploads cf https://github.com/huggingface/transformers/commit/855ff0e91d8b3bd75a3b1c1316e2efd814373764#commitcomment-36452545 --- README.md | 10 +++++----- docs/source/model_sharing.md | 12 ++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index a5ae74a9ae..c33a65bdbb 100644 --- a/README.md +++ b/README.md @@ -464,21 +464,21 @@ transformers-cli upload ./path/to/pretrained_model/ # ^^ Upload folder containing weights/tokenizer/config # saved via `.save_pretrained()` -transformers-cli upload ./config.json [--filename foobar.json] +transformers-cli upload ./config.json [--filename folder/foobar.json] # ^^ Upload a single file -# (you can optionally override its filename) +# (you can optionally override its filename, which can be nested inside a folder) ``` -Your model will then be accessible through its identifier: +Your model will then be accessible through its identifier, a concatenation of your username and the folder name above: ```python "username/model_name" ``` Anyone can load it from code: ```python -tokenizer = AutoTokenizer.from_pretrained("username/model_name") -model = AutoModel.from_pretrained("username/model_name") +tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model") +model = AutoModel.from_pretrained("username/pretrained_model") ``` Finally, list all your files on S3: diff --git a/docs/source/model_sharing.md b/docs/source/model_sharing.md index b9c722b10f..95baafb575 100644 --- a/docs/source/model_sharing.md +++ b/docs/source/model_sharing.md @@ -15,21 +15,21 @@ transformers-cli upload ./path/to/pretrained_model/ # ^^ Upload folder containing weights/tokenizer/config # saved via `.save_pretrained()` -transformers-cli upload ./config.json [--filename foobar.json] +transformers-cli upload ./config.json [--filename folder/foobar.json] # ^^ Upload a single file -# (you can optionally override its filename) +# (you can optionally override its filename, which can be nested inside a folder) ``` -Your model will then be accessible through its identifier: +Your model will then be accessible through its identifier, a concatenation of your username and the folder name above: ```python -"username/model_name" +"username/pretrained_model" ``` Anyone can load it from code: ```python -tokenizer = AutoTokenizer.from_pretrained("username/model_name") -model = AutoModel.from_pretrained("username/model_name") +tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model") +model = AutoModel.from_pretrained("username/pretrained_model") ``` Finally, list all your files on S3: From 3c6efd0ca367063b8b3883020b54aa22fc4abb27 Mon Sep 17 00:00:00 2001 From: erenup Date: Tue, 17 Dec 2019 11:18:12 +0800 Subject: [PATCH 181/302] updated usage example in modeling_roberta for question and answering --- transformers/modeling_roberta.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 2f6f634fa6..ea9211cbb9 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -585,13 +585,16 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, 
sequence_length, sequence_length)``: Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads. Examples:: - tokenizer = RobertaTokenizer.from_pretrained('roberta-base') - model = RobertaForMultipleChoice.from_pretrained('roberta-base') - input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - start_positions = torch.tensor([1]) - end_positions = torch.tensor([3]) - outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions) - loss, start_scores, end_scores = outputs[:2] + tokenizer = RobertaTokenizer.from_pretrained('roberta-large') + model = RobertaForQuestionAnswering.from_pretrained('roberta-large') + question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet" + input_ids = tokenizer.encode(question, text) + start_scores, end_scores = model(torch.tensor([input_ids])) + all_tokens = tokenizer.convert_ids_to_tokens(input_ids) + print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) + # a nice puppet + # Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' + to be uploaded. """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP From d000195ee683c4cba15a739202140c04cdcc19bd Mon Sep 17 00:00:00 2001 From: erenup Date: Tue, 17 Dec 2019 11:28:34 +0800 Subject: [PATCH 182/302] add comment for example_index and unique_id in single process --- transformers/data/processors/squad.py | 2 +- transformers/modeling_roberta.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index b4cd8dedac..f028141924 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -225,7 +225,7 @@ def squad_convert_example_to_features(example, max_seq_length, span['token_type_ids'], cls_index, p_mask.tolist(), - example_index=0, + example_index=0, # Can not set unique_id and example_index here. They will be set after multiple processing. unique_id=0, paragraph_len=span['paragraph_len'], token_is_max_context=span["token_is_max_context"], diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index ea9211cbb9..05b7ad345b 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -592,8 +592,8 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): start_scores, end_scores = model(torch.tensor([input_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) - # a nice puppet - # Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' + a nice puppet + Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' to be uploaded. 
""" config_class = RobertaConfig From 805c21aebacc00f447affd35dbc159d7f64d10dd Mon Sep 17 00:00:00 2001 From: erenup Date: Tue, 17 Dec 2019 11:36:00 +0800 Subject: [PATCH 183/302] tried to fix the failed checks --- transformers/modeling_roberta.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index 05b7ad345b..85f9e6e461 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -591,10 +591,7 @@ class RobertaForQuestionAnswering(BertPreTrainedModel): input_ids = tokenizer.encode(question, text) start_scores, end_scores = model(torch.tensor([input_ids])) all_tokens = tokenizer.convert_ids_to_tokens(input_ids) - print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])) - a nice puppet - Note: 'roberta-large' model can not produce the right answer above. Waiting for 'roberta-large-finetuned-squad' - to be uploaded. + answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]) """ config_class = RobertaConfig pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP From f349826a57e6f7f1eb5c28ef3b3ff0ac6884ad24 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Tue, 17 Dec 2019 10:36:04 +0100 Subject: [PATCH 184/302] model: fix cls and sep token for XLM-RoBERTa documentation --- transformers/modeling_xlm_roberta.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index 8402be4b5c..4c833c69ff 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -111,17 +111,17 @@ class XLMRobertaModel(RobertaModel): Last layer hidden-state of the first token of the sequence (classification token) further processed by a Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence prediction (classification) - eo match pre-training, XLM-RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows: + eo match pre-training, XLM-RoBERTa input sequence should be formatted with and tokens as follows: (a) For sequence pairs: - ``tokens: [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]`` + ``tokens: is this jack ##son ##ville ? no it is not . `` ``token_type_ids: 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1`` (b) For single sequences: - ``tokens: [CLS] the dog is hairy . [SEP]`` + ``tokens: the dog is hairy . 
`` ``token_type_ids: 0 0 0 0 0 0 0`` From d7c62661a314c631b3bbf6405143934c8c3e8b5f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 17 Dec 2019 11:23:39 +0100 Subject: [PATCH 185/302] Provide serving dependencies for tensorflow and pytorch (serving-tf, serving-torch) --- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 0b7e512955..b3b6e2e063 100644 --- a/setup.py +++ b/setup.py @@ -38,7 +38,9 @@ from setuptools import find_packages, setup extras = { - 'serving': ['uvicorn', 'fastapi'] + 'serving': ['uvicorn', 'fastapi'], + 'serving-tf': ['uvicorn', 'fastapi', 'tensorflow'], + 'serving-torch': ['uvicorn', 'fastapi', 'torch'] } extras['all'] = [package for package in extras.values()] From 2f1c745cded91b2f6cfed5b502ea5cbd7d6b9ac7 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 17 Dec 2019 11:47:54 +0100 Subject: [PATCH 186/302] update conversion script --- ...onvert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py index 6873f1d0f0..884c273d2c 100644 --- a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -47,7 +47,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece') roberta.eval() # disable dropout config = BertConfig( - vocab_size_or_config_json_file=250002, + vocab_size=250002, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, num_attention_heads=roberta.args.encoder_attention_heads, From 2fde5a2489bc4aa7fc42ab76effde241b2a0b919 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 17 Dec 2019 12:16:07 +0100 Subject: [PATCH 187/302] Initial bunch of documentation. --- transformers/pipelines.py | 118 +++++++++++++++++++++++++++++++++++--- 1 file changed, 111 insertions(+), 7 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 2a8f26b03e..6dcb865c74 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -80,6 +80,15 @@ class _ScikitCompat(ABC): class PipelineDataFormat: + """ + Base class for all the pipeline supported data format both for reading and writing. + Supported data formats currently includes: + - JSON + - CSV + + PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns + to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. + """ SUPPORTED_FORMATS = ['json', 'csv'] def __init__(self, output: str, path: str, column: str): @@ -138,7 +147,6 @@ class CsvPipelineDataFormat(PipelineDataFormat): class JsonPipelineDataFormat(PipelineDataFormat): - def __init__(self, output: str, path: str, column: str): super().__init__(output, path, column) @@ -158,6 +166,11 @@ class JsonPipelineDataFormat(PipelineDataFormat): class Pipeline(_ScikitCompat): + """ + Base class implementing pipelined operations. 
+ Pipeline workflow is defined as a sequence of the following operations: + Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output + """ def __init__(self, model, tokenizer: PreTrainedTokenizer = None, args_parser: ArgumentHandler = None, device: int = -1, **kwargs): @@ -171,6 +184,9 @@ class Pipeline(_ScikitCompat): self.model = self.model.to('cuda:{}'.format(self.device)) def save_pretrained(self, save_directory): + """ + Save the pipeline's model and tokenizer to the specified save_directory + """ if not os.path.isdir(save_directory): logger.error("Provided path ({}) should be a directory".format(save_directory)) return @@ -179,9 +195,16 @@ class Pipeline(_ScikitCompat): self.tokenizer.save_pretrained(save_directory) def transform(self, X): + """ + Scikit / Keras interface to transformers' pipelines. This method will forward to __call__(). + """ return self(X=X) def predict(self, X): + """ + Scikit / Keras interface to transformers' pipelines. This method will forward to __call__(). + Se + """ return self(X=X) def __call__(self, *texts, **kwargs): @@ -198,6 +221,17 @@ class Pipeline(_ScikitCompat): @contextmanager def device_placement(self): + """ + Context Manager allowing tensor allocation on the user-specified device in framework agnostic way. + example: + # Explicitly ask for tensor allocation on CUDA device :0 + nlp = pipeline(..., device=0) + with nlp.device_placement(): + # Every framework specific tensor allocation will be done on the request device + output = nlp(...) + Returns: + Context manager + """ if is_tf_available(): import tensorflow as tf with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)): @@ -210,6 +244,13 @@ class Pipeline(_ScikitCompat): yield def _forward(self, inputs): + """ + Internal framework specific forward dispatching. + Args: + inputs: dict holding all the keyworded arguments for required by the model forward method. + Returns: + Numpy array + """ if is_tf_available(): # TODO trace model predictions = self.model(inputs)[0] @@ -222,11 +263,17 @@ class Pipeline(_ScikitCompat): class FeatureExtractionPipeline(Pipeline): + """ + Feature extraction pipeline using Model head. + """ def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() class TextClassificationPipeline(Pipeline): + """ + Text classification pipeline using ModelForTextClassification head. + """ def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2): super().__init__(model, tokenizer) @@ -239,7 +286,9 @@ class TextClassificationPipeline(Pipeline): class NerPipeline(Pipeline): - + """ + Named Entity Recognition pipeline using ModelForTokenClassification head. + """ def __init__(self, model, tokenizer: PreTrainedTokenizer): super().__init__(model, tokenizer) @@ -286,7 +335,7 @@ class NerPipeline(Pipeline): class QuestionAnsweringPipeline(Pipeline): """ - Question Answering pipeline involving Tokenization and Inference. + Question Answering pipeline using ModelForQuestionAnswering head. """ class QuestionAnsweringArgumentHandler(ArgumentHandler): @@ -341,9 +390,15 @@ class QuestionAnsweringPipeline(Pipeline): @staticmethod def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: - is_list = isinstance(question, list) - - if is_list: + """ + QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. 
+ This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). + We currently support extractive question answering. + Args: + question: (str, List[str]) The question to be ask for the associated context + context: (str, List[str]) The context in which we will look for the answer. + """ + if isinstance(question, list): return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] else: return SquadExample(None, question, context, None, None, None) @@ -352,6 +407,12 @@ class QuestionAnsweringPipeline(Pipeline): super().__init__(model, tokenizer, args_parser=QuestionAnsweringPipeline.QuestionAnsweringArgumentHandler()) def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: + """ + Generates the input dictionary with model-specific parameters. + + Returns: + dict holding all the required parameters for model's forward + """ args = ['input_ids', 'attention_mask'] model_type = type(self.model).__name__.lower() @@ -367,6 +428,20 @@ class QuestionAnsweringPipeline(Pipeline): return {k: [feature.__dict__[k] for feature in features] for k in args} def __call__(self, *texts, **kwargs): + """ + Args: + We support multiple use-cases, the following are exclusive: + X: sequence of SquadExample + data: sequence of SquadExample + question: (str, List[str]), batch of question(s) to map along with context + context: (str, List[str]), batch of context(s) associated with the provided question keyword argument + Returns: + dict: {'answer': str, 'score": float, 'start": int, "end": int} + answer: the textual answer in the intial context + score: the score the current answer scored for the model + start: the character index in the original string corresponding to the beginning of the answer' span + end: the character index in the original string corresponding to the ending of the answer' span + """ # Set defaults values kwargs.setdefault('topk', 1) kwargs.setdefault('doc_stride', 128) @@ -432,6 +507,19 @@ class QuestionAnsweringPipeline(Pipeline): return answers def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple: + """ + Take the output of any QuestionAnswering head and will generate probalities for each span to be + the actual answer. + In addition, it filters out some unwanted/impossible cases like answer len being greater than + max_answer_len or answer end position being before the starting position. + The method supports output the k-best answer through the topk argument. + + Args: + start: numpy array, holding individual start probabilities for each token + end: numpy array, holding individual end probabilities for each token + topk: int, indicates how many possible answer span(s) to extract from the model's output + max_answer_len: int, maximum size of the answer to extract from the model's output + """ # Ensure we have batch axis if start.ndim == 1: start = start[None] @@ -459,6 +547,18 @@ class QuestionAnsweringPipeline(Pipeline): return start, end, candidates[0, start, end] def span_to_answer(self, text: str, start: int, end: int): + """ + When decoding from token probalities, this method maps token indexes to actual word in + the initial context. 
+ + Args: + text: str, the actual context to extract the answer from + start: int, starting answer token index + end: int, ending answer token index + + Returns: + dict: {'answer': str, 'start': int, 'end': int} + """ words = [] token_idx = char_start_idx = char_end_idx = chars_idx = 0 @@ -514,7 +614,11 @@ SUPPORTED_TASKS = { def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline: """ - Utility factory method to build pipeline. + Utility factory method to build a pipeline. + Pipeline are made of: + A Tokenizer instance in charge of mapping raw textual input to token + A Model instance + Some (optional) post processing for enhancing model's output """ # Try to infer tokenizer from model name (if provided as str) if tokenizer is None: From d303f84e7bca4d70eb9685e620f74454d940b991 Mon Sep 17 00:00:00 2001 From: Gunnlaugur Thor Briem Date: Tue, 17 Dec 2019 16:18:00 +0000 Subject: [PATCH 188/302] fix: wrong architecture count in README MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Just say “the following” so that this intro doesn't so easily fall out of date :) ) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c33a65bdbb..e8f883c3e1 100644 --- a/README.md +++ b/README.md @@ -132,7 +132,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training ## Model architectures -🤗 Transformers currently provides 10 NLU/NLG architectures: +🤗 Transformers currently provides the following NLU/NLG architectures: 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. 
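To make the pipeline docstrings added in PATCH 187 above concrete, a minimal usage sketch, not part of the patch series; the task string, checkpoint names and the exact shape of the return value are assumptions.

```python
# Illustrative sketch only; assumes a checkout with PATCH 187 applied and that
# SUPPORTED_TASKS maps 'question-answering' to QuestionAnsweringPipeline.
# Model and tokenizer names are assumed, not taken from the patch.
from transformers.pipelines import pipeline

nlp = pipeline('question-answering',
               model='distilbert-base-uncased-distilled-squad',
               tokenizer='distilbert-base-uncased')

result = nlp(question="Who was Jim Henson?", context="Jim Henson was a nice puppet")

# Per the __call__ docstring above, each answer is reported as
# {'answer': str, 'score': float, 'start': int, 'end': int}.
print(result)
```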
From b6938916ac7f00cd260e70d54b252909c40bced6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 17 Dec 2019 17:23:36 +0100 Subject: [PATCH 189/302] adding beam search --- transformers/configuration_utils.py | 9 +- transformers/modeling_utils.py | 267 ++++++++++++++++++++++++---- 2 files changed, 235 insertions(+), 41 deletions(-) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 9c3360892d..8c3e0a9f9c 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -62,13 +62,18 @@ class PretrainedConfig(object): self.is_decoder = kwargs.pop('is_decoder', False) # Parameters for sequence generation - self.generate_length = kwargs.pop('generate_length', 10) + self.generate_max_length = kwargs.pop('generate_max_length', 20) self.generate_do_sample = kwargs.pop('generate_do_sample', False) self.generate_num_beams = kwargs.pop('generate_num_beams', 1) self.generate_temperature = kwargs.pop('generate_temperature', 1.0) self.generate_top_k = kwargs.pop('generate_top_k', 50) - self.generate_top_p = kwargs.pop('generate_top_p', 0.0) + self.generate_top_p = kwargs.pop('generate_top_p', 1.0) self.generate_repetition_penalty = kwargs.pop('generate_repetition_penalty', 1.0) + self.generate_bos_token_id = kwargs.pop('generate_bos_token_id', 0) + self.generate_pad_token_id = kwargs.pop('generate_pad_token_id', 0) + self.generate_eos_token_ids = kwargs.pop('generate_eos_token_ids', 0) + self.generate_batch_size = kwargs.pop('generate_batch_size', 1) + self.generate_length_penalty = kwargs.pop('generate_length_penalty', 1.) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 27d42c552a..003e17a0d9 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -1,5 +1,5 @@ # coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); @@ -488,63 +488,252 @@ class PreTrainedModel(nn.Module): return model - def generate(self, input_ids=None, length=None, do_sample=False, num_beams=None, - temperature=None, top_k=None, top_p=None, repetition_penalty=None, - **model_kwargs): - """ Generic sequence generator for single-stack models with a LM head. + def prepare_inputs_for_generation(self, input_ids, **kwargs): + return {"input_ids": input_ids} - The method currently supports greedy decoding and sampling. See the - documentation of the `Sampler` class for more information about the - parameters related to sampling. + def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None, + temperature=None, top_k=None, top_p=None, repetition_penalty=None, + bos_token_id=None, pad_token_id=None, eos_token_ids=None, batch_size=None, + length_penalty=None, **kwargs): + """ Sequence generator for models with a LM head. + + The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling + and beam-search. + + Adapted in part from Facebook's XLM beam search code: https://github.com/facebookresearch/XLM Params: **input_ids**: (`optional`) `torch.LongTensor` of shape (1, sequence_length) The sequence used as a prompt for the generation. 
If `None` the method initializes it as an empty `torch.LongTensor` of shape (1,) - **length**: (`optional`) int - The length of the sequence to be generated. + **max_length**: (`optional`) int + The max length of the sequence to be generated. Between 1 and infinity. Default to 20. **do_sample**: (`optional`) bool - If set to `False` we use greedy decoding; otherwise sampling. + If set to `False` we use greedy decoding; otherwise sampling. Default to greedy sampling. + **num_beams**: (`optional`) int + Number of beams for beam search. 1 means no beam serach. Default to 1. **temperature**: (`optional`) float The value used to module the next token probabilities. - **k**: (`optional`) int - The parameter used for k-filtering. - **p**: (`optional`) float - The parameter for nucleus sampling. Must be between 0 and 1. + **top_k**: (`optional`) int + The number of highest probability vocabulary tokens to keep for top-k-filtering. Between 1 and infinity. Default to 50. + **top_p**: (`optional`) float + The cumulative probability of parameter highest probability vocabulary tokens to keep for nucleus sampling. Must be between 0 and 1. Default to 1. **repetition_penalty**: (`optional`) float - The parameter for repetition penalty. + The parameter for repetition penalty. Between 1.0 and + infinity. 1.0 means no penalty. Default to 1. """ - if input_ids is None: - input_ids = torch.tensor([[]], dtype=torch.long, device=next(self.parameters()).device) - # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: raise AttributeError("You tried do generated sequences with a model that does not have a LM Head.") - sampler_config = { - "k": k, - "p": p, - "do_sample": do_sample, - "temperature": temperature, - "repetition_penalty": repetition_penalty, - } + max_length = max_length if max_length is not None else self.config.generate_max_length + do_sample = do_sample if do_sample is not None else self.config.generate_do_sample + num_beams = num_beams if num_beams is not None else self.config.generate_num_beams + temperature = temperature if temperature is not None else self.config.generate_temperature + top_k = top_k if top_k is not None else self.config.generate_top_k + top_p = top_p if top_p is not None else self.config.generate_top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.generate_repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.generate_bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.generate_pad_token_id + eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.generate_eos_token_ids + batch_size = batch_size if batch_size is not None else self.config.generate_batch_size + length_penalty = length_penalty if length_penalty is not None else self.config.generate_length_penalty - sampler = Sampler(**sampler_config) - generated_sequence = input_ids - for _ in trange(length): - arguments = self._prepare_inputs_for_decoding(generated_sequence, **model_kwargs) - outputs = self(**arguments) - next_tokens_logits = outputs[0][:, -1, :] - next_tokens = sampler.get_one_token( - next_tokens_logits, generated_sequence - ) - generated_sequence = torch.cat((generated_sequence, next_tokens), dim=1) + if input_ids is not None: + batch_size = input_ids.shape[0] # overriden by the input batch_size + if isinstance(eos_token_ids, int): + eos_token_ids = [eos_token_ids] - return generated_sequence.squeeze(0) + assert 
isinstance(max_length, int) and 0 < max_length, "`max_length` should be a strictely positive integer." + assert isinstance(do_sample, bool), "`do_sample` should be a boolean." + assert isinstance(num_beams, int) and 0 < num_beams, "`num_beams` should be a strictely positive integer." + assert 0 < temperature, "`temperature` should be positive." + assert isinstance(top_k, int) and 0 < top_k, "`top_k` should be a strictely positive integer." + assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." + assert 0 < repetition_penalty, "`repetition_penalty` should be strictely positive." + assert isinstance(bos_token_id, int) and 0 <= bos_token_id, "`bos_token_id` should be a positive integer." + assert isinstance(pad_token_id, int) and 0 <= pad_token_id, "`pad_token_id` should be a positive integer." + assert isinstance(eos_token_ids, (list, tuple)) and (0 <= e for e in eos_token_ids), \ + "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." + assert isinstance(batch_size, int) and 0 < batch_size, "`batch_size` should be a strictely positive integer." + assert 0 < length_penalty, "`length_penalty` should be strictely positive." - def _prepare_inputs_for_decoding(self, input_ids, **model_kwargs): - return model_kwargs.update({"input_ids": input_ids}) + if input_ids is None: + input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device) + else: + assert input_ids.dims() == 2 + + # current position and vocab size + cur_len = 1 + vocab_size = self.config.vocab_size + + # Expand input to num beams + input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len) + input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) + + # generated hypotheses + generated_hyps = [BeamHypotheses(num_beams, max_length, length_penalty, early_stopping=False) for _ in range(batch_size)] + + # scores for each sentence in the beam + beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) + beam_scores[:, 1:] = -1e9 + beam_scores = beam_scores.view(-1) + + # cache compute states + pasts = None # self.prepare_pasts() + + # done sentences + done = [False for _ in range(batch_size)] + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) + scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) + scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) + scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) + assert scores.size() == (batch_size * num_beams, vocab_size) + + # select next words with scores + _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) + + next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True) + assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams) + + # next batch beam content + # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch) + next_batch_beam = [] + + # for each sentence + for sent_id in range(batch_size): + + # if we are done with this sentence + done[sent_id] = done[sent_id] or generated_hyps[sent_id].is_done(next_scores[sent_id].max().item()) + if done[sent_id]: + next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch + 
continue + + # next sentence beam content + next_sent_beam = [] + + # next words for this sentence + for idx, value in zip(next_words[sent_id], next_scores[sent_id]): + + # get beam and word IDs + beam_id = idx // vocab_size + word_id = idx % vocab_size + + # end of sentence, or next word + if word_id.item() in eos_token_ids or cur_len + 1 == max_length: + generated_hyps[sent_id].add(input_ids[sent_id * num_beams + beam_id, :cur_len].clone(), value.item()) + else: + next_sent_beam.append((value, word_id, sent_id * num_beams + beam_id)) + + # the beam for next step is full + if len(next_sent_beam) == num_beams: + break + + # update next beam content + assert len(next_sent_beam) == 0 if cur_len + 1 == max_length else num_beams + if len(next_sent_beam) == 0: + next_sent_beam = [(0, pad_token_id, 0)] * num_beams # pad the batch + next_batch_beam.extend(next_sent_beam) + assert len(next_batch_beam) == num_beams * (sent_id + 1) + + # sanity check / prepare next batch + assert len(next_batch_beam) == batch_size * num_beams + beam_scores = beam_scores.new([x[0] for x in next_batch_beam]) + beam_words = input_ids.new([x[1] for x in next_batch_beam]) + beam_idx = input_ids.new([x[2] for x in next_batch_beam]) + + # re-order batch and internal states + input_ids = input_ids[beam_idx, :] + input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1) + # TODO: Activate cache + # for k in cache.keys(): + # if k != 'slen': + # cache[k] = (cache[k][0][beam_idx], cache[k][1][beam_idx]) + + # update current length + cur_len = cur_len + 1 + + # stop when we are done with each sentence + if all(done): + break + + # visualize hypotheses + # print([len(x) for x in generated_hyps], cur_len) + # globals().update( locals() ); + # !import code; code.interact(local=vars()) + # for ii in range(batch_size): + # for ss, ww in sorted(generated_hyps[ii].hyp, key=lambda x: x[0], reverse=True): + # print("%.3f " % ss + " ".join(self.dico[x] for x in ww.tolist())) + # print("") + + # select the best hypotheses + tgt_len = src_len.new(batch_size) + best = [] + + for i, hypotheses in enumerate(generated_hyps): + best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1] + tgt_len[i] = len(best_hyp) + 1 # +1 for the symbol + best.append(best_hyp) + + # generate target batch + decoded = src_len.new(tgt_len.max().item(), batch_size).fill_(self.pad_index) + for i, hypo in enumerate(best): + decoded[:tgt_len[i] - 1, i] = hypo + decoded[tgt_len[i] - 1, i] = self.eos_index + + # sanity check + assert (decoded == self.eos_index).sum() == 2 * batch_size + + return decoded, tgt_len + + +class BeamHypotheses(object): + + def __init__(self, n_hyp, max_length, length_penalty, early_stopping): + """ + Initialize n-best list of hypotheses. + """ + self.max_length = max_length - 1 # ignoring bos_token + self.length_penalty = length_penalty + self.early_stopping = early_stopping + self.n_hyp = n_hyp + self.hyp = [] + self.worst_score = 1e9 + + def __len__(self): + """ + Number of hypotheses in the list. + """ + return len(self.hyp) + + def add(self, hyp, sum_logprobs): + """ + Add a new hypothesis to the list. 
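+        A hypothesis is scored as sum_logprobs / len(hyp) ** self.length_penalty;
+        only the n_hyp best-scoring hypotheses are kept, the current worst one
+        being dropped whenever a better candidate comes in.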
+ """ + score = sum_logprobs / len(hyp) ** self.length_penalty + if len(self) < self.n_hyp or score > self.worst_score: + self.hyp.append((score, hyp)) + if len(self) > self.n_hyp: + sorted_scores = sorted([(s, idx) for idx, (s, _) in enumerate(self.hyp)]) + del self.hyp[sorted_scores[0][1]] + self.worst_score = sorted_scores[1][0] + else: + self.worst_score = min(score, self.worst_score) + + def is_done(self, best_sum_logprobs): + """ + If there are enough hypotheses and that none of the hypotheses being generated + can become better than the worst one in the heap, then we are done with this sentence. + """ + if len(self) < self.n_hyp: + return False + elif self.early_stopping: + return True + else: + return self.worst_score >= best_sum_logprobs / self.max_length ** self.length_penalty class Sampler(object): From 55397dfb9b7e61001e10abb595931d4a98ae58b0 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 17 Dec 2019 13:10:51 -0500 Subject: [PATCH 190/302] CsvPipelineDataFormat: Fix for single-column --- transformers/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 6dcb865c74..dec7843baf 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -136,7 +136,7 @@ class CsvPipelineDataFormat(PipelineDataFormat): if self.is_multi_columns: yield {k: row[c] for k, c in self.column} else: - yield row[self.column] + yield row[self.column[0]] def save(self, data: List[dict]): with open(self.output, 'w') as f: From 2cff4bd8f3ad412917f4f295b97b952e297fa257 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 17 Dec 2019 14:01:04 -0500 Subject: [PATCH 191/302] Fix segmentation fault --- transformers/file_utils.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 81c9b8002f..16010f7e0a 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -26,14 +26,6 @@ from contextlib import contextmanager logger = logging.getLogger(__name__) # pylint: disable=invalid-name -try: - import tensorflow as tf - assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 - _tf_available = True # pylint: disable=invalid-name - logger.info("TensorFlow version {} available.".format(tf.__version__)) -except (ImportError, AssertionError): - _tf_available = False # pylint: disable=invalid-name - try: import torch _torch_available = True # pylint: disable=invalid-name @@ -41,6 +33,13 @@ try: except ImportError: _torch_available = False # pylint: disable=invalid-name +try: + import tensorflow as tf + assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 + _tf_available = True # pylint: disable=invalid-name + logger.info("TensorFlow version {} available.".format(tf.__version__)) +except (ImportError, AssertionError): + _tf_available = False # pylint: disable=invalid-name try: from torch.hub import _get_torch_home From 5e289f69bc564c94132f77c89a34e5f1dd69a592 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Tue, 17 Dec 2019 14:17:11 -0500 Subject: [PATCH 192/302] regex 2019.12.17 install fails with Python 2 --- requirements.txt | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9c43abc6d7..32edee0712 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,7 +5,7 @@ boto3 # Used for downloading models over HTTP requests # For OpenAI GPT -regex +regex != 2019.12.17 # For XLNet sentencepiece # For XLM diff --git 
a/setup.py b/setup.py index eacb5ecec0..bf09a7d48a 100644 --- a/setup.py +++ b/setup.py @@ -59,7 +59,7 @@ setup( 'boto3', 'requests', 'tqdm', - 'regex', + 'regex != 2019.12.17', 'sentencepiece', 'sacremoses'], entry_points={ From bbc0c86f9b96b62b95853a18945f855c661a13b9 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 17 Dec 2019 23:27:02 +0100 Subject: [PATCH 193/302] beam search + single beam decoding --- transformers/modeling_utils.py | 152 ++++++++++++++++++++++++++------- 1 file changed, 123 insertions(+), 29 deletions(-) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 003e17a0d9..52743d8c2f 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -544,29 +544,90 @@ class PreTrainedModel(nn.Module): if isinstance(eos_token_ids, int): eos_token_ids = [eos_token_ids] - assert isinstance(max_length, int) and 0 < max_length, "`max_length` should be a strictely positive integer." + assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." - assert isinstance(num_beams, int) and 0 < num_beams, "`num_beams` should be a strictely positive integer." - assert 0 < temperature, "`temperature` should be positive." - assert isinstance(top_k, int) and 0 < top_k, "`top_k` should be a strictely positive integer." + assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." + assert temperature > 0, "`temperature` should be positive." + assert isinstance(top_k, int) and top_k > 0, "`top_k` should be a strictely positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." - assert 0 < repetition_penalty, "`repetition_penalty` should be strictely positive." - assert isinstance(bos_token_id, int) and 0 <= bos_token_id, "`bos_token_id` should be a positive integer." - assert isinstance(pad_token_id, int) and 0 <= pad_token_id, "`pad_token_id` should be a positive integer." - assert isinstance(eos_token_ids, (list, tuple)) and (0 <= e for e in eos_token_ids), \ + assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." + assert isinstance(bos_token_id, int) and bos_token_id >= 0, "`bos_token_id` should be a positive integer." + assert isinstance(pad_token_id, int) and pad_token_id >= 0, "`pad_token_id` should be a positive integer." + assert isinstance(eos_token_ids, (list, tuple)) and (e >= 0 for e in eos_token_ids), \ "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." - assert isinstance(batch_size, int) and 0 < batch_size, "`batch_size` should be a strictely positive integer." - assert 0 < length_penalty, "`length_penalty` should be strictely positive." + assert isinstance(batch_size, int) and batch_size > 0, "`batch_size` should be a strictely positive integer." + assert length_penalty > 0, "`length_penalty` should be strictely positive." if input_ids is None: input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device) else: - assert input_ids.dims() == 2 + assert input_ids.dims() == 2, "Input prompt should be of shape (batch_size, sequence length)." 
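+        # Illustrative call (argument values here are arbitrary, not defaults):
+        #   model.generate(input_ids, max_length=40, do_sample=True, top_k=50,
+        #                  top_p=0.95, repetition_penalty=1.2, num_beams=1)
+        # Below we compute the prompt length, then dispatch either to beam search
+        # (num_beams > 1) or to the single-beam greedy/sampling loop.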
# current position and vocab size - cur_len = 1 + cur_len = input_ids.shape[1] vocab_size = self.config.vocab_size + if num_beams > 1: + return self._generate_beam_search(input_ids, cur_len, max_length, do_sample, length_penalty, + num_beams, pad_token_id, eos_token_ids, vocab_size, batch_size) + + return self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, + temperature, top_k, top_p, repetition_penalty, + pad_token_id, eos_token_ids, batch_size) + + def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample, + temperature, top_k, top_p, repetition_penalty, + pad_token_id, eos_token_ids, batch_size): + """ Generate a sentence without beam search (num_beams == 1). """ + # current position / max lengths / length of generated sentences / unfinished sentences + unfinished_sents = input_ids.new(batch_size).fill_(1) + + # cache compute states + pasts = None + + while cur_len < max_length: + model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) + outputs = self(**model_inputs) + next_token_logits = outputs[0][:, -1, :] + + # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + for i in range(batch_size): + for _ in set(input_ids[i].tolist()): + next_token_logits[i, _] /= repetition_penalty + + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + next_token_logits = next_token_logits / temperature + # Top-p/top-k filtering + next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) + # Sample + next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) + else: + # Greedy decoding + next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1) + + # update generations and finished sentences + tokens_to_add = next_token * unfinished_sents + pad_token_id * (1 - unfinished_sents) + input_ids = torch.cat([input_ids, tokens_to_add], dim=-1) + for eos_token_id in eos_token_ids: + unfinished_sents.mul_(tokens_to_add.squeeze(-1).ne(eos_token_id).long()) + cur_len = cur_len + 1 + + # stop when there is a in each sentence, or if we exceed the maximul length + if unfinished_sents.max() == 0: + break + + # add eos_token_ids to unfinished sentences + if cur_len == max_length: + input_ids[:, -1].masked_fill_(unfinished_sents.byte(), eos_token_ids[0]) + + return input_ids + + def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, length_penalty, + num_beams, pad_token_id, eos_token_ids, vocab_size, batch_size): + """ Generate a sentence with beam search. 
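+        At each step the 2 * num_beams highest-scoring (beam, token) candidates are
+        gathered per sentence; candidates that hit an EOS token (or max_length) are
+        stored in BeamHypotheses, the others refill the beams, and decoding stops
+        once every sentence is done or max_length is reached.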
""" # Expand input to num beams input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len) input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) @@ -592,9 +653,11 @@ class PreTrainedModel(nn.Module): scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) assert scores.size() == (batch_size * num_beams, vocab_size) - # select next words with scores - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) - _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) + # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) + _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams) @@ -604,11 +667,11 @@ class PreTrainedModel(nn.Module): next_batch_beam = [] # for each sentence - for sent_id in range(batch_size): + for batch_ex in range(batch_size): # if we are done with this sentence - done[sent_id] = done[sent_id] or generated_hyps[sent_id].is_done(next_scores[sent_id].max().item()) - if done[sent_id]: + done[batch_ex] = done[batch_ex] or generated_hyps[batch_ex].is_done(next_scores[batch_ex].max().item()) + if done[batch_ex]: next_batch_beam.extend([(0, pad_token_id, 0)] * num_beams) # pad the batch continue @@ -616,7 +679,7 @@ class PreTrainedModel(nn.Module): next_sent_beam = [] # next words for this sentence - for idx, value in zip(next_words[sent_id], next_scores[sent_id]): + for idx, score in zip(next_words[batch_ex], next_scores[batch_ex]): # get beam and word IDs beam_id = idx // vocab_size @@ -624,9 +687,9 @@ class PreTrainedModel(nn.Module): # end of sentence, or next word if word_id.item() in eos_token_ids or cur_len + 1 == max_length: - generated_hyps[sent_id].add(input_ids[sent_id * num_beams + beam_id, :cur_len].clone(), value.item()) + generated_hyps[batch_ex].add(input_ids[batch_ex * num_beams + beam_id, :cur_len].clone(), score.item()) else: - next_sent_beam.append((value, word_id, sent_id * num_beams + beam_id)) + next_sent_beam.append((score, word_id, batch_ex * num_beams + beam_id)) # the beam for next step is full if len(next_sent_beam) == num_beams: @@ -637,7 +700,7 @@ class PreTrainedModel(nn.Module): if len(next_sent_beam) == 0: next_sent_beam = [(0, pad_token_id, 0)] * num_beams # pad the batch next_batch_beam.extend(next_sent_beam) - assert len(next_batch_beam) == num_beams * (sent_id + 1) + assert len(next_batch_beam) == num_beams * (batch_ex + 1) # sanity check / prepare next batch assert len(next_batch_beam) == batch_size * num_beams @@ -670,7 +733,7 @@ class PreTrainedModel(nn.Module): # print("") # select the best hypotheses - tgt_len = src_len.new(batch_size) + tgt_len = input_ids.new(batch_size) best = [] for i, hypotheses in enumerate(generated_hyps): @@ -679,15 +742,46 @@ class PreTrainedModel(nn.Module): best.append(best_hyp) # generate target batch - decoded = src_len.new(tgt_len.max().item(), batch_size).fill_(self.pad_index) + decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id) for i, hypo in 
enumerate(best): - decoded[:tgt_len[i] - 1, i] = hypo - decoded[tgt_len[i] - 1, i] = self.eos_index + decoded[i, :tgt_len[i] - 1] = hypo + decoded[i, tgt_len[i] - 1] = eos_token_ids[0] - # sanity check - assert (decoded == self.eos_index).sum() == 2 * batch_size + # # sanity check + # assert (decoded == eos_token_ids[0]).sum() == 2 * batch_size - return decoded, tgt_len + return decoded + + +def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): + """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering + Args: + logits: logits distribution shape (batch size x vocabulary size) + top_k > 0: keep only top k tokens with highest probability (top-k filtering). + top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751) + From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 + """ + top_k = min(top_k, logits.size(-1)) # Safety check + if top_k > 0: + # Remove all tokens with a probability less than the last token of the top-k + indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] + logits[indices_to_remove] = filter_value + + if top_p > 0.0: + sorted_logits, sorted_indices = torch.sort(logits, descending=True) + cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) + + # Remove tokens with cumulative probability above the threshold + sorted_indices_to_remove = cumulative_probs > top_p + # Shift the indices to the right to keep also the first token above the threshold + sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() + sorted_indices_to_remove[..., 0] = 0 + + # scatter sorted tensors to original indexing + indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove) + logits[indices_to_remove] = filter_value + return logits class BeamHypotheses(object): From 77d397202ba3daa013c94696e9825de8e20145e8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Tue, 17 Dec 2019 23:28:46 +0100 Subject: [PATCH 194/302] clean up dead code --- transformers/modeling_utils.py | 140 --------------------------------- 1 file changed, 140 deletions(-) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 52743d8c2f..0e285c4f6b 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -830,146 +830,6 @@ class BeamHypotheses(object): return self.worst_score >= best_sum_logprobs / self.max_length ** self.length_penalty -class Sampler(object): - r""" Sampler is used to generate sequences of ids from logit inputs. - - Greedy decoding, which consists in chosing the most probable token at each - step, is the default behaviour. Sampling with varying temperature, top_k - and nucleus filtering is also implemented. - - Attributes: - **device**: ``torch.device`` - Device on which the computations will be run. - **do_sample**: bool - Whether to sample or do greedy decoding. - **k**: int between 0 and vocab_size - Parameter for the top-k filtering - **p**: float between 0 and 1 - Parameter for the nucleus filtering - **temperature**: strictly positive float - Parameter used to modulate the distribution over ids. Low temperatures - put more emphasis on highly probably token while high temperatures tend - to smooth the probability distribution. 
- **repetition_penalty**: strictly postitive float - The penalty applied to repeating ids - """ - - def __init__( - self, do_sample=False, k=9, p=0.0, temperature=1.0, repetition_penalty=1.0 - ): - self.k = k - self.p = p - self.do_sample = do_sample - self.temperature = temperature - self.repetition_penalty = repetition_penalty - - self.do_apply_repetition_penalty = True if repetition_penalty > 1 else False - - if self.p > 1: - warnings.warn( - """You are trying to apply nucleus filtering with a value of p greater than 1 ({}). - However p is a probability and its value must lie between 0 and 1. In effect, no filtering - will be applied. If this is not the behavior you expect, change the value of p.""".format( - self.p - ) - ) - - def get_one_token(self, next_token_logits, past_sequence): - logits = self.apply_repetition_penalty(next_token_logits, past_sequence) - if self.do_sample: - logits = self.apply_temperature(logits) - logits = self.apply_top_k_filter(logits) - logits = self.apply_nucleus_filter(logits) - return torch.multinomial(F.softmax(logits, dim=-1), num_samples=1) - return torch.argmax(logits, dim=-1).unsqueeze(-1) - - def apply_repetition_penalty(self, logits, past_sequence): - """ Apply a penalty to tokens that appear more than once in the - generated sequence. - - .. Keskar, Nitish Shirish, et al. "Ctrl: A conditional transformer - language model for controllable generation." arXiv preprint - arXiv:1909.05858 (2019). - """ - if self.do_apply_repetition_penalty: - generated_token_idx = set(past_sequence[0].tolist()) - for token_idx in generated_token_idx: - logits[0, token_idx] /= self.repetition_penalty - return logits - - def apply_temperature(self, logits): - """ Shape the tokens' distribution through temperature. The higher the value - of the temperature, the more skewed towards high probability events the - distribution is. - - .. Goodfellow, Ian, Yoshua Bengio, and Aaron Courville. Deep learning. - MIT press, 2016. - """ - # when dividing a float by 0, torch returns inf which in turns breaks the - # multinomial with an error message that is not very helpful. It is better - # for the user to break the execution and explain why. - if self.temperature == 0: - raise ZeroDivisionError( - """You are trying to sample with a temperature equal to 0. - If you wanted to do greedy sampling, set instead `do_sample` to False. - Otherwise set the temperature to a value different from 0.""" - ) - return logits / self.temperature - - def apply_top_k_filter(self, logits): - """ Use the probability distribution of the tokens to determine the set - to be sampled from. Specifically we select the set of size k such that - the sum of its items' probabilities is maximum. - - .. Fan, Angela, Mike Lewis, and Yann Dauphin. "Hierarchical neural - story generation." arXiv preprint arXiv:1805.04833 (2018). - """ - if self.k > 0: - vocabulary_size = logits.size(-1) - if self.k > vocabulary_size: - warnings.warn( - """You provided a value for k ({}) that is larger than the vocabulary size ({}). - We adjusted k's value to the vocabulary size; if that was what you intended to do - we recommend setting k to 0 instead. 
It this is not the behavior you expected, - choose a value of k that is smaller than the vocabulary size.""".format( - self.k, vocabulary_size - ) - ) - self.k = vocabulary_size - - indices_to_remove = logits < torch.topk(logits, self.k)[0][..., -1, None] - logits[indices_to_remove] = -float("Inf") - - return logits - - def apply_nucleus_filter(self, logits): - """ Use the probability distribution of the tokens to determine the set - to be sampled from. Specifically, choose the smallest set such that the - sum of its items' probabilities is greater than a number p in [0,1]. - - .. Holtzman, Ari, et al. "The curious case of neural text - degeneration." arXiv preprint arXiv:1904.09751 (2019). - """ - if self.p > 0: - sorted_logits, sorted_indices = torch.sort(logits, descending=True) - sorted_probabilities = F.softmax(sorted_logits, dim=-1) - cumulative_probabilities = torch.cumsum(sorted_probabilities, dim=-1) - - # Remove tokens with cumulative probability above the threshold, - # but keep the first token above the threshold. - sorted_indices_to_remove = cumulative_probabilities > self.p - sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() - sorted_indices_to_remove[..., 0] = 0 - - # scatter sorted tensors to original indexing - indices_to_remove = sorted_indices_to_remove.scatter( - dim=-1, index=sorted_indices, src=sorted_indices_to_remove - ) - logits[indices_to_remove] = -float("Inf") - - return logits - - class Conv1D(nn.Module): def __init__(self, nf, nx): """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2) From a4df2e011367020253c8ca8a714c4b4855ff61bc Mon Sep 17 00:00:00 2001 From: Arman Cohan Date: Tue, 26 Nov 2019 16:03:07 -0800 Subject: [PATCH 195/302] update roberta conversion - update to fix conversion for the updated fairseq model - create save directory if not exist --- ..._original_pytorch_checkpoint_to_pytorch.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index b4dc1bb61b..be3460a86f 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -20,6 +20,7 @@ import argparse import logging import numpy as np import torch +import pathlib from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer @@ -79,15 +80,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ ### self attention self_attn: BertSelfAttention = layer.attention.self assert( - roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size)) + roberta_layer.self_attn.k_proj.weight.data.shape == \ + roberta_layer.self_attn.q_proj.weight.data.shape == \ + roberta_layer.self_attn.v_proj.weight.data.shape == \ + torch.Size((config.hidden_size, config.hidden_size)) ) - # we use three distinct linear layers so we split the source layer here. 
- self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :] - self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size] - self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :] - self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size] - self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :] - self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:] + + self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight + self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias + self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight + self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias + self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight + self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias ### self-attention output self_output: BertSelfOutput = layer.attention.output @@ -151,6 +155,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ if not success: raise Exception("Something went wRoNg") + pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) print(f"Saving model to {pytorch_dump_folder_path}") model.save_pretrained(pytorch_dump_folder_path) From ea636440d1ea3497785c2682c410da478f8b1841 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 17 Dec 2019 18:06:42 -0500 Subject: [PATCH 196/302] [roberta.conversion] Do not hardcode vocab size and support for fairseq 0.9+ --- ...t_roberta_original_pytorch_checkpoint_to_pytorch.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py index be3460a86f..fedfc1ecb8 100644 --- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py +++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py @@ -22,6 +22,12 @@ import numpy as np import torch import pathlib +import fairseq +from packaging import version + +if version.parse(fairseq.__version__) < version.parse("0.9.0"): + raise Exception("requires fairseq >= 0.9.0") + from fairseq.models.roberta import RobertaModel as FairseqRobertaModel from fairseq.modules import TransformerSentenceEncoderLayer from transformers.modeling_bert import (BertConfig, BertEncoder, @@ -46,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ """ roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path) roberta.eval() # disable dropout + roberta_sent_encoder = roberta.model.decoder.sentence_encoder config = BertConfig( - vocab_size=50265, + vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings, hidden_size=roberta.args.encoder_embed_dim, num_hidden_layers=roberta.args.encoder_layers, num_attention_heads=roberta.args.encoder_attention_heads, @@ -65,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_ # Now let's copy all the weights. 
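+    # the fairseq weights are copied tensor-by-tensor onto the matching sub-modules
+    # of the freshly initialised HF model, starting with the embeddings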
# Embeddings - roberta_sent_encoder = roberta.model.decoder.sentence_encoder model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. From a0d386455b347508ea31fc88dd06cc5555255c37 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 17 Dec 2019 20:07:39 -0500 Subject: [PATCH 197/302] Fix outdated tokenizer doc --- templates/adding_a_new_model/tokenization_xxx.py | 2 +- transformers/tokenization_bert.py | 4 ++-- transformers/tokenization_distilbert.py | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py index 3d6b4ad9df..7a10a41e5a 100644 --- a/templates/adding_a_new_model/tokenization_xxx.py +++ b/templates/adding_a_new_model/tokenization_xxx.py @@ -85,7 +85,7 @@ class XxxTokenizer(PreTrainedTokenizer): Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False + do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index ded5072e58..7ab8029da8 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -113,12 +113,12 @@ class BertTokenizer(PreTrainedTokenizer): Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False + do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True do_basic_tokenize: Whether to do basic tokenization before wordpiece. max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the minimum of this value (if specified) and the underlying BERT model's sequence length. never_split: List of tokens which will never be split during tokenization. Only has an effect when - do_wordpiece_only=False + do_basic_tokenize=True """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py index f40bf2bd77..2f245d71dc 100644 --- a/transformers/tokenization_distilbert.py +++ b/transformers/tokenization_distilbert.py @@ -53,12 +53,12 @@ class DistilBertTokenizer(BertTokenizer): Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file - do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False + do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True do_basic_tokenize: Whether to do basic tokenization before wordpiece. max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the minimum of this value (if specified) and the underlying BERT model's sequence length. never_split: List of tokens which will never be split during tokenization. 
Only has an effect when - do_wordpiece_only=False + do_basic_tokenize=True """ vocab_files_names = VOCAB_FILES_NAMES From 8ac840ff8758fb242e3e89cbc809366165ccf960 Mon Sep 17 00:00:00 2001 From: Antti Virtanen Date: Mon, 16 Dec 2019 17:08:25 +0200 Subject: [PATCH 198/302] Adding Finnish BERT. --- transformers/configuration_bert.py | 4 +++- transformers/modeling_bert.py | 4 +++- transformers/modeling_tf_bert.py | 4 +++- transformers/tokenization_bert.py | 6 ++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index 9072820bce..b1974966a9 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -45,7 +45,9 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json", 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json" + 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", + 'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/config.json", + 'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/config.json", } diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index d0f35272ac..d0cb5ec617 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -51,7 +51,9 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin", 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin" + 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", + 'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin", + 'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin", } diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 7cc71f5063..20b5895dbd 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -51,7 +51,9 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5", 'bert-base-japanese-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", - 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5" + 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", + #'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin", + #'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin", } diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 7ab8029da8..6f6a4d6f19 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -46,6 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", + 'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/vocab.txt", + 'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/vocab.txt", } } @@ -65,6 +67,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'bert-base-cased-finetuned-mrpc': 512, 'bert-base-german-dbmdz-cased': 512, 'bert-base-german-dbmdz-uncased': 512, + 'bert-base-finnish-cased-v1': 512, + 'bert-base-finnish-uncased-v1': 512, } PRETRAINED_INIT_CONFIGURATION = { @@ -83,6 +87,8 @@ PRETRAINED_INIT_CONFIGURATION = { 'bert-base-cased-finetuned-mrpc': {'do_lower_case': False}, 'bert-base-german-dbmdz-cased': {'do_lower_case': False}, 'bert-base-german-dbmdz-uncased': {'do_lower_case': True}, + 'bert-base-finnish-cased-v1': {'do_lower_case': False}, + 'bert-base-finnish-uncased-v1': {'do_lower_case': True}, } From abc43ffbfff69dc91f354c34f1c7c5b48a5c1502 Mon Sep 17 00:00:00 2001 From: Antti Virtanen Date: Mon, 16 Dec 2019 18:08:00 +0200 Subject: [PATCH 199/302] Add pretrained model documentation for FinBERT. --- docs/source/pretrained_models.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index c6b990f213..7d037da34f 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -79,6 +79,14 @@ Here is the full list of the currently provided pretrained models together with | | ``bert-base-japanese-char-whole-word-masking`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters. | | | | (see `details on cl-tohoku repository `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-finnish-cased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on cased Finnish text. 
| +| | | (see `details on turkunlp.org `__). | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``bert-base-finnish-uncased-v1`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | +| | | | Trained on uncased Finnish text. | +| | | (see `details on turkunlp.org `__). | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | GPT | ``openai-gpt`` | | 12-layer, 768-hidden, 12-heads, 110M parameters. | | | | | OpenAI GPT English model | From c5f35e61db8d7286173515071e76612e9e5f5ce5 Mon Sep 17 00:00:00 2001 From: Antti Virtanen Date: Mon, 16 Dec 2019 21:06:14 +0200 Subject: [PATCH 200/302] Uploaded files to AWS. --- transformers/configuration_bert.py | 4 ++-- transformers/modeling_bert.py | 4 ++-- transformers/modeling_tf_bert.py | 4 ++-- transformers/tokenization_bert.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index b1974966a9..c2ccc578c2 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -46,8 +46,8 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", - 'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/config.json", - 'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/config.json", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-config.json", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-config.json", } diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index d0cb5ec617..4e034f4b6e 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -52,8 +52,8 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", - 'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin", - 'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-pytorch_model.bin", + 'bert-base-finnish-uncased-v1': 
"https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-pytorch_model.bin", } diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 20b5895dbd..5a989c299f 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -52,8 +52,8 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", - #'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin", - #'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-tf_model.h5", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-tf_model.h5", } diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 6f6a4d6f19..c11c1b4d3c 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -46,8 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = { 'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", - 'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/vocab.txt", - 'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/vocab.txt", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt", } } From 7ffa8173905cb6d0819fc424a4806e81a44dd0e0 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Mon, 16 Dec 2019 18:55:14 -0500 Subject: [PATCH 201/302] [s3] mv files and update links --- transformers/configuration_bert.py | 4 ++-- transformers/modeling_bert.py | 4 ++-- transformers/modeling_tf_bert.py | 4 ++-- transformers/tokenization_bert.py | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py index c2ccc578c2..7b495013ff 100644 --- a/transformers/configuration_bert.py +++ b/transformers/configuration_bert.py @@ -46,8 +46,8 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json", 'bert-base-japanese-char-whole-word-masking': 
"https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-config.json", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-config.json", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json", } diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py index 4e034f4b6e..afeb9d8e21 100644 --- a/transformers/modeling_bert.py +++ b/transformers/modeling_bert.py @@ -52,8 +52,8 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin", 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-pytorch_model.bin", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-pytorch_model.bin", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin", } diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py index 5a989c299f..b4f97c06d9 100644 --- a/transformers/modeling_tf_bert.py +++ b/transformers/modeling_tf_bert.py @@ -52,8 +52,8 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = { 'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5", 'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5", 'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-tf_model.h5", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-tf_model.h5", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5", } diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index c11c1b4d3c..18b96c99b3 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -46,8 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = { 'bert-base-cased-finetuned-mrpc': 
"https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt", 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", - 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt", + 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", } } From 94c99db34cf9074a212c36554fb925c513d70ab1 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Tue, 17 Dec 2019 20:34:22 -0500 Subject: [PATCH 202/302] [FinBERT] fix incorrect url --- transformers/tokenization_bert.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py index 18b96c99b3..edc26d88cf 100644 --- a/transformers/tokenization_bert.py +++ b/transformers/tokenization_bert.py @@ -47,7 +47,7 @@ PRETRAINED_VOCAB_FILES_MAP = { 'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt", 'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt", 'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", - 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt", + 'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt", } } From e347725d8c2fea7a89605d02d1f465594cf7df85 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Tue, 17 Dec 2019 23:32:52 +0100 Subject: [PATCH 203/302] More fine-grained control over pipeline creation with config argument. --- transformers/pipelines.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index dec7843baf..bcb4d9e054 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -497,7 +497,7 @@ class QuestionAnsweringPipeline(Pipeline): 'score': score.item(), 'start': np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(), 'end': np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(), - 'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]: feature.token_to_orig_map[e] + 1]) + 'answer': ' '.join(example.doc_tokens[feature.token_to_orig_map[s]:feature.token_to_orig_map[e] + 1]) } for s, e, score in zip(starts, ends, scores) ] @@ -612,7 +612,8 @@ SUPPORTED_TASKS = { } -def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline: +def pipeline(task: str, model, config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline: """ Utility factory method to build a pipeline. 
Pipeline are made of: @@ -637,13 +638,21 @@ def pipeline(task: str, model, config: Optional[PretrainedConfig] = None, tokeni task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] # Special handling for model conversion - from_tf = model.endswith('.h5') and not is_tf_available() - from_pt = model.endswith('.bin') and not is_torch_available() + if isinstance(model, str): + from_tf = model.endswith('.h5') and not is_tf_available() + from_pt = model.endswith('.bin') and not is_torch_available() - if from_tf: - logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. Trying to load the model with PyTorch.') - elif from_pt: - logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. Trying to load the model with Tensorflow.') + if from_tf: + logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. ' + 'Trying to load the model with PyTorch.') + elif from_pt: + logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. ' + 'Trying to load the model with Tensorflow.') + else: + from_tf = from_pt = False + + if isinstance(config, str): + config = PretrainedConfig.from_pretrained(config) if allocator.__name__.startswith('TF'): model = allocator.from_pretrained(model, config=config, from_pt=from_pt) From 641a8decdc6c34ce1837c9602fe84a65ec5b741a Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 18 Dec 2019 10:43:48 +0100 Subject: [PATCH 204/302] clean up code and add arbitrary number of return sequences --- transformers/configuration_utils.py | 25 +-- transformers/modeling_encoder_decoder.py | 95 ---------- transformers/modeling_utils.py | 163 +++++++++++------ transformers/tests/sampling_test.py | 213 ----------------------- 4 files changed, 119 insertions(+), 377 deletions(-) delete mode 100644 transformers/tests/sampling_test.py diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 8c3e0a9f9c..456af3341c 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -62,18 +62,19 @@ class PretrainedConfig(object): self.is_decoder = kwargs.pop('is_decoder', False) # Parameters for sequence generation - self.generate_max_length = kwargs.pop('generate_max_length', 20) - self.generate_do_sample = kwargs.pop('generate_do_sample', False) - self.generate_num_beams = kwargs.pop('generate_num_beams', 1) - self.generate_temperature = kwargs.pop('generate_temperature', 1.0) - self.generate_top_k = kwargs.pop('generate_top_k', 50) - self.generate_top_p = kwargs.pop('generate_top_p', 1.0) - self.generate_repetition_penalty = kwargs.pop('generate_repetition_penalty', 1.0) - self.generate_bos_token_id = kwargs.pop('generate_bos_token_id', 0) - self.generate_pad_token_id = kwargs.pop('generate_pad_token_id', 0) - self.generate_eos_token_ids = kwargs.pop('generate_eos_token_ids', 0) - self.generate_batch_size = kwargs.pop('generate_batch_size', 1) - self.generate_length_penalty = kwargs.pop('generate_length_penalty', 1.) 
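+        # generation defaults: generate() falls back to these attributes whenever
+        # the corresponding argument is left as None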
+ self.max_length = kwargs.pop('max_length', 20) + self.do_sample = kwargs.pop('do_sample', False) + self.num_beams = kwargs.pop('num_beams', 1) + self.temperature = kwargs.pop('temperature', 1.0) + self.top_k = kwargs.pop('top_k', 50) + self.top_p = kwargs.pop('top_p', 1.0) + self.repetition_penalty = kwargs.pop('repetition_penalty', 1.0) + self.bos_token_id = kwargs.pop('bos_token_id', 0) + self.pad_token_id = kwargs.pop('pad_token_id', 0) + self.eos_token_ids = kwargs.pop('eos_token_ids', 0) + self.batch_size = kwargs.pop('batch_size', 1) + self.length_penalty = kwargs.pop('length_penalty', 1.) + self.num_return_sequences = kwargs.pop('num_return_sequences', 1) def save_pretrained(self, save_directory): """ Save a configuration object to the directory `save_directory`, so that it diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index 3d8c812c2f..d69a75cc75 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -25,7 +25,6 @@ from torch import nn from tqdm import trange from .modeling_auto import AutoModel, AutoModelWithLMHead -from .modeling_utils import Sampler logger = logging.getLogger(__name__) @@ -203,100 +202,6 @@ class PreTrainedEncoderDecoder(nn.Module): return decoder_outputs + encoder_outputs - def decode( - self, - encoder_input_ids, - decoder_prompt_ids=None, - device=torch.device("cpu"), - length=10, - do_sample=False, - temperature=1.0, - k=9, - p=0., - repetition_penalty=1., - **kwargs - ): - """ Generic sequence generator for encoder-decoder models. - - For encoder-decoders the generation consists in: - - Performing a forward pass through the encoder once; - - Pass the encoder's hidden states to a decoding mechanism that - repeatedly calls the decoder to generate sequences. - - The method currently supports greedy decoding and sampling. See the - documentation of the `Sampler` class for more information about the - parameters related to sampling. - - Params: - **encoder_input_ids**: `torch.LongTensor` of shape (1, sequence_length) - The sequence to encode. - **decoder_prompt_ids**: (`optional`) `torch.LongTensor` of shape (1, sequence_length) - The sequence used as a prompt for the generation. If `None` the method initializes - it as an empty `torch.LongTensor` of shape (1,) - **device**: (`optional`) `torch.device` - The device on which the prompt_ids will be initialized if not provided. - **length**: (`optional`) int - The length of the sequence to be generated. - **do_sample**: (`optional`) bool - If set to `False` we use greedy decoding; otherwise sampling. - **temperature**: (`optional`) float - The value used to module the next token probabilities. - **k**: (`optional`) int - The parameter used for k-filtering. - **p**: (`optional`) float - The parameter for nucleus sampling. Must be between 0 and 1. - **repetition_penalty**: (`optional`) float - The parameter for repetition penalty. - """ - if decoder_prompt_ids is None: - decoder_prompt_ids = torch.tensor([[]], dtype=torch.long, device=device) - - # When the model does not have a LM head `get_output_embeddings` - # returns `None`. We use this mechanism to determine whether we - # should proceed with decoding or not. - if self.decoder.get_output_embeddings() is None: - raise AttributeError("You tried do generated sequences with a decoder that does not have a LM Head.") - - # The followings checks that the decoder is on the same device as the one - # that is specified. It only works for models that fit on one GPU. 
- decoder_device = next(self.decoder.parameters()).device - if decoder_device != decoder_prompt_ids.device: - warnings.warn( - "The decoder is not on the same device as the prompt. Expected {}, got {}.".format( - decoder_prompt_ids.device, decoder_device - ) - ) - - kwargs_encoder, kwargs_decoder = self.prepare_model_kwargs(**kwargs) - with torch.no_grad(): - encoder_outputs = self.encoder(encoder_input_ids, **kwargs) - encoder_hidden_states = encoder_outputs[0] - kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states - - sampler_config = { - "k": k, - "p": p, - "do_sample": do_sample, - "temperature": temperature, - "repetition_penalty": repetition_penalty, - } - return self._greedy_decode_or_sample( - decoder_prompt_ids, length, sampler_config, **kwargs_decoder - ) - - def _greedy_decode_or_sample(self, prompt_ids, length, sampler_config, **kwargs_decoder): - sampler = Sampler(**sampler_config) - with torch.no_grad(): - generated_sequence = prompt_ids - for _ in trange(length): - arguments = self.decoder._prepare_inputs_for_decoding(generated_sequence, **kwargs_decoder) - outputs = self.decoder(**arguments) - next_tokens_logits = outputs[0][:, -1, :] - next_tokens = sampler.get_one_token(next_tokens_logits, generated_sequence) - generated_sequence = torch.cat((generated_sequence, next_tokens), dim=1) - - return generated_sequence.squeeze(0) - @staticmethod def prepare_model_kwargs(**kwargs): """ Prepare the encoder and decoder's keyword arguments. diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 0e285c4f6b..6fa68a0db4 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -494,7 +494,7 @@ class PreTrainedModel(nn.Module): def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, bos_token_id=None, pad_token_id=None, eos_token_ids=None, batch_size=None, - length_penalty=None, **kwargs): + length_penalty=None, num_return_sequences=None, **kwargs): """ Sequence generator for models with a LM head. 
The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling @@ -526,18 +526,19 @@ class PreTrainedModel(nn.Module): if self.get_output_embeddings() is None: raise AttributeError("You tried do generated sequences with a model that does not have a LM Head.") - max_length = max_length if max_length is not None else self.config.generate_max_length - do_sample = do_sample if do_sample is not None else self.config.generate_do_sample - num_beams = num_beams if num_beams is not None else self.config.generate_num_beams - temperature = temperature if temperature is not None else self.config.generate_temperature - top_k = top_k if top_k is not None else self.config.generate_top_k - top_p = top_p if top_p is not None else self.config.generate_top_p - repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.generate_repetition_penalty - bos_token_id = bos_token_id if bos_token_id is not None else self.config.generate_bos_token_id - pad_token_id = pad_token_id if pad_token_id is not None else self.config.generate_pad_token_id - eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.generate_eos_token_ids - batch_size = batch_size if batch_size is not None else self.config.generate_batch_size - length_penalty = length_penalty if length_penalty is not None else self.config.generate_length_penalty + max_length = max_length if max_length is not None else self.config.max_length + do_sample = do_sample if do_sample is not None else self.config.do_sample + num_beams = num_beams if num_beams is not None else self.config.num_beams + temperature = temperature if temperature is not None else self.config.temperature + top_k = top_k if top_k is not None else self.config.top_k + top_p = top_p if top_p is not None else self.config.top_p + repetition_penalty = repetition_penalty if repetition_penalty is not None else self.config.repetition_penalty + bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id + pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id + eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids + batch_size = batch_size if batch_size is not None else self.config.batch_size + length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty + num_return_sequences = num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences if input_ids is not None: batch_size = input_ids.shape[0] # overriden by the input batch_size @@ -547,8 +548,8 @@ class PreTrainedModel(nn.Module): assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." - assert temperature > 0, "`temperature` should be positive." - assert isinstance(top_k, int) and top_k > 0, "`top_k` should be a strictely positive integer." + assert temperature > 0, "`temperature` should be strictely positive." + assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." assert isinstance(bos_token_id, int) and bos_token_id >= 0, "`bos_token_id` should be a positive integer." 
@@ -557,30 +558,41 @@ class PreTrainedModel(nn.Module): "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." assert isinstance(batch_size, int) and batch_size > 0, "`batch_size` should be a strictely positive integer." assert length_penalty > 0, "`length_penalty` should be strictely positive." + assert isinstance(num_return_sequences, int) and num_return_sequences > 0, "`num_return_sequences` should be a strictely positive integer." if input_ids is None: input_ids = torch.full((batch_size, 1), bos_token_id, dtype=torch.long, device=next(self.parameters()).device) else: - assert input_ids.dims() == 2, "Input prompt should be of shape (batch_size, sequence length)." + assert input_ids.dim() == 2, "Input prompt should be of shape (batch_size, sequence length)." # current position and vocab size cur_len = input_ids.shape[1] vocab_size = self.config.vocab_size if num_beams > 1: - return self._generate_beam_search(input_ids, cur_len, max_length, do_sample, length_penalty, - num_beams, pad_token_id, eos_token_ids, vocab_size, batch_size) - + return self._generate_beam_search(input_ids, cur_len, max_length, do_sample, + temperature, top_k, top_p, repetition_penalty, + pad_token_id, eos_token_ids, batch_size, + num_return_sequences, + length_penalty, num_beams, vocab_size) return self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size) + pad_token_id, eos_token_ids, batch_size, + num_return_sequences) def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size): - """ Generate a sentence without beam search (num_beams == 1). """ + pad_token_id, eos_token_ids, batch_size, + num_return_sequences): + """ Generate `num_return_sequences` sequences per batch example without beam search (num_beams == 1). + All returned sequence are generated independantly. 
+ """ + # Expand input to num return sequences + input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len) + input_ids = input_ids.contiguous().view(batch_size*num_return_sequences, cur_len) # (batch_size*num_return_sequences, cur_len) + # current position / max lengths / length of generated sentences / unfinished sentences - unfinished_sents = input_ids.new(batch_size).fill_(1) + unfinished_sents = input_ids.new(batch_size*num_return_sequences).fill_(1) # cache compute states pasts = None @@ -592,9 +604,9 @@ class PreTrainedModel(nn.Module): # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: - for i in range(batch_size): - for _ in set(input_ids[i].tolist()): - next_token_logits[i, _] /= repetition_penalty + for i in range(batch_size*num_return_sequences): + for previous_tokens in set(input_ids[i].tolist()): + next_token_logits[i, previous_tokens] /= repetition_penalty if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) @@ -603,16 +615,16 @@ class PreTrainedModel(nn.Module): # Top-p/top-k filtering next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) # Sample - next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) + next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1).squeeze(1) else: # Greedy decoding - next_token = torch.argmax(next_token_logits, dim=-1).unsqueeze(-1) + next_token = torch.argmax(next_token_logits, dim=-1) # update generations and finished sentences tokens_to_add = next_token * unfinished_sents + pad_token_id * (1 - unfinished_sents) - input_ids = torch.cat([input_ids, tokens_to_add], dim=-1) + input_ids = torch.cat([input_ids, tokens_to_add.unsqueeze(-1)], dim=-1) for eos_token_id in eos_token_ids: - unfinished_sents.mul_(tokens_to_add.squeeze(-1).ne(eos_token_id).long()) + unfinished_sents.mul_(tokens_to_add.ne(eos_token_id).long()) cur_len = cur_len + 1 # stop when there is a in each sentence, or if we exceed the maximul length @@ -621,13 +633,24 @@ class PreTrainedModel(nn.Module): # add eos_token_ids to unfinished sentences if cur_len == max_length: - input_ids[:, -1].masked_fill_(unfinished_sents.byte(), eos_token_ids[0]) + input_ids[:, -1].masked_fill_(unfinished_sents.to(dtype=torch.bool), eos_token_ids[0]) + + if num_return_sequences != 1: + input_ids = input_ids.view(batch_size, num_return_sequences, -1) return input_ids - def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, length_penalty, - num_beams, pad_token_id, eos_token_ids, vocab_size, batch_size): - """ Generate a sentence with beam search. """ + def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, + temperature, top_k, top_p, repetition_penalty, + pad_token_id, eos_token_ids, batch_size, + num_return_sequences, + length_penalty, num_beams, vocab_size): + """ Generate `num_return_sequences` sequences per batch example with beam search. + We return the top-`num_return_sequences` beams. 
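# Illustrative sketch (not from the patch): a minimal example of how the unfinished_sents
# mask used in _generate_no_beam_search above keeps appending pad_token_id to sequences
# that already produced an EOS token, while the unfinished ones keep growing.
import torch

next_token = torch.tensor([7, 4, 9])        # proposed next token for three sequences
unfinished_sents = torch.tensor([1, 0, 1])  # the second sequence has already finished
pad_token_id = 0
tokens_to_add = next_token * unfinished_sents + pad_token_id * (1 - unfinished_sents)
# tokens_to_add is tensor([7, 0, 9]): the finished sequence receives pad_token_id instead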
+ `num_return_sequences` should be bigger than `num_beams` (we default to the min of both) + """ + num_return_sequences = min(num_return_sequences, num_beams) + # Expand input to num beams input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len) input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) @@ -638,7 +661,7 @@ class PreTrainedModel(nn.Module): # scores for each sentence in the beam beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device) beam_scores[:, 1:] = -1e9 - beam_scores = beam_scores.view(-1) + beam_scores = beam_scores.view(-1) # shape (batch_size * num_beams,) # cache compute states pasts = None # self.prepare_pasts() @@ -648,18 +671,40 @@ class PreTrainedModel(nn.Module): while cur_len < max_length: model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) - scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) - scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) - scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) - assert scores.size() == (batch_size * num_beams, vocab_size) + scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) + scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) - # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) - _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858) + if repetition_penalty != 1.0: + for i in range(batch_size * num_beams): + for previous_tokens in set(input_ids[i].tolist()): + scores[i, previous_tokens] /= repetition_penalty - # re-organize to group the beam together (we are keeping top hypothesis accross beams) - _scores = _scores.view(batch_size, num_beams * vocab_size) # (batch_size, num_beams * vocab_size) + if do_sample: + # Temperature (higher temperature => more likely to sample low probability tokens) + if temperature != 1.0: + scores = scores / temperature + # Top-p/top-k filtering + scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p) # (batch_size * num_beams, vocab_size) + # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search) + next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2) # (batch_size * num_beams, 2) + # Compute next scores + _scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) + _scores = torch.gather(_scores, -1, next_words) # (batch_size * num_beams, 2) + next_scores = _scores + beam_scores[:, None].expand_as(_scores) # (batch_size * num_beams, 2) + # Match shape of greedy beam search + next_words = next_words.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) + next_scores = next_scores.view(batch_size, 2 * num_beams) # (batch_size, 2 * num_beams) + else: + # do greedy beam search + scores = F.log_softmax(scores, dim=-1) # (batch_size * num_beams, vocab_size) + assert scores.size() == (batch_size * num_beams, vocab_size) + # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product) + _scores = scores + beam_scores[:, None].expand_as(scores) # (batch_size * num_beams, vocab_size) + # re-organize to group the beam together (we are keeping top hypothesis accross beams) + _scores = _scores.view(batch_size, num_beams * 
vocab_size) # (batch_size, num_beams * vocab_size) + next_scores, next_words = torch.topk(_scores, 2*num_beams, dim=1, largest=True, sorted=True) - next_scores, next_words = torch.topk(_scores, 2 * num_beams, dim=1, largest=True, sorted=True) assert next_scores.size() == next_words.size() == (batch_size, 2 * num_beams) # next batch beam content @@ -733,32 +778,36 @@ class PreTrainedModel(nn.Module): # print("") # select the best hypotheses - tgt_len = input_ids.new(batch_size) - best = [] + tgt_len = input_ids.new(batch_size, num_return_sequences) + bests = [] for i, hypotheses in enumerate(generated_hyps): - best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1] - tgt_len[i] = len(best_hyp) + 1 # +1 for the symbol - best.append(best_hyp) + best_hyps = [hyp[1] for hyp in sorted(hypotheses.hyp, key=lambda hyp: hyp[0])[-num_return_sequences:]] + for j, hyp in enumerate(best_hyps): + tgt_len[i, j] = len(hyp) + 1 # +1 for the symbol + bests.append(best_hyps) # generate target batch - decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id) - for i, hypo in enumerate(best): - decoded[i, :tgt_len[i] - 1] = hypo - decoded[i, tgt_len[i] - 1] = eos_token_ids[0] + decoded = input_ids.new(batch_size, num_return_sequences, tgt_len.max().item()).fill_(pad_token_id) + for i, hyps in enumerate(bests): + for j, hypo in enumerate(hyps): + decoded[i, j, :tgt_len[i, j] - 1] = hypo + decoded[i, j, tgt_len[i, j] - 1] = eos_token_ids[0] + if num_return_sequences == 1: + decoded = decoded.squeeze(1) # # sanity check # assert (decoded == eos_token_ids[0]).sum() == 2 * batch_size return decoded -def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')): +def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf')): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: logits: logits distribution shape (batch size x vocabulary size) - top_k > 0: keep only top k tokens with highest probability (top-k filtering). - top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). + if top_k > 0: keep only top k tokens with highest probability (top-k filtering). + if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ @@ -768,7 +817,7 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf') indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] logits[indices_to_remove] = filter_value - if top_p > 0.0: + if top_p < 1.0: sorted_logits, sorted_indices = torch.sort(logits, descending=True) cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) diff --git a/transformers/tests/sampling_test.py b/transformers/tests/sampling_test.py deleted file mode 100644 index 98cc23bf2b..0000000000 --- a/transformers/tests/sampling_test.py +++ /dev/null @@ -1,213 +0,0 @@ -# coding=utf-8 -import sys -import unittest - -import numpy as np -import pytest - -from transformers import is_torch_available - -if is_torch_available(): - import torch - - from transformers import ( - BertConfig, - BertModel, - GPT2Config, - GPT2LMHeadModel, - OpenAIGPTConfig, - OpenAIGPTLMHeadModel, - TransfoXLConfig, - TransfoXLLMHeadModel, - XLMConfig, - XLMWithLMHeadModel, - XLNetConfig, - XLNetLMHeadModel, - Model2Model, - ) - from transformers.modeling_utils import Sampler -else: - pytestmark = pytest.mark.skip("Require Torch") - - -class SamplerTest(unittest.TestCase): - def test_nucleus_sampling(self): - inf = -float("Inf") - test_cases = ( - { - "p": 0, - "logits": torch.tensor([0.3, 0.1, 0.2]), - "expected": torch.tensor([0.3, 0.1, 0.2]), - }, - { - "p": 0.01, - "logits": torch.tensor([0.3, 0.1, 0.2]), - "expected": torch.tensor([0.3, inf, inf]), - }, - { - "p": 1, - "logits": torch.tensor([0.3, 0.1, 0.2]), - "expected": torch.tensor([0.3, 0.1, 0.2]), - }, - { - "p": 0.2, - "logits": torch.tensor([0.7, 0.1, 0.2]), - "expected": torch.tensor([0.7, inf, inf]), - }, - { - "p": 0.71, - "logits": torch.tensor([0.7, 0.1, 0.2]), - "expected": torch.tensor([0.7, inf, 0.2]), - }, - { - "p": 0.71, - "logits": torch.tensor([0.1, 0.7, 0.2]), - "expected": torch.tensor([inf, 0.7, 0.2]), - }, - { - "p": 0.71, - "logits": torch.tensor([0.7, 0.2, 0.1]), - "expected": torch.tensor([0.7, 0.2, inf]), - }, - { - "p": 0.91, - "logits": torch.tensor([0.7, 0.1, 0.2]), - "expected": torch.tensor([0.7, 0.1, 0.2]), - }, - ) - for case in test_cases: - config = { - "do_sample": True, - "temperature": 1.0, - "k": 0, - "p": case["p"], - "repetition_penalty": 1.0, - } - sampler = Sampler(**config) - filtered_logits = sampler.apply_nucleus_filter(case["logits"]) - np.testing.assert_array_equal(case["expected"].numpy(), filtered_logits.numpy()) - - def test_top_k_filter(self): - inf = -float("Inf") - test_cases = ( - { - "k": 0, - "logits": torch.tensor([0.7, 0.1, 0.2]), - "expected": torch.tensor([0.7, 0.1, 0.2]), - }, - { - "k": 1, - "logits": torch.tensor([0.7, 0.1, 0.2]), - "expected": torch.tensor([0.7, inf, inf]), - }, - { - "k": 2, - "logits": torch.tensor([0.7, 0.1, 0.2]), - "expected": torch.tensor([0.7, inf, 0.2]), - }, - { - "k": 3, - "logits": torch.tensor([0.7, 0.1, 0.2]), - "expected": torch.tensor([0.7, 0.1, 0.2]), - }, - ) - for case in test_cases: - config = { - "do_sample": True, - "temperature": 1.0, - "k": case["k"], - "p": 0, - "repetition_penalty": 1.0, - } - sampler = Sampler(**config) - filtered_logits = sampler.apply_top_k_filter(case["logits"]) - np.testing.assert_array_equal(case["expected"].numpy(), filtered_logits.numpy()) - - @pytest.mark.skipif(sys.version_info < (3, 2), reason="assertWarns() requires Python >= 3.2") - def test_wrong_k_value(self): - case = {"k": 10, 
"vocab_size": 5} - config = { - "do_sample": True, - "temperature": 1.0, - "k": case["k"], - "p": 0, - "repetition_penalty": 1.0, - } - sampler = Sampler(**config) - next_token_logits = torch.rand(case["vocab_size"]).unsqueeze(0) - past_sequence = torch.tensor([]) - with self.assertWarns(UserWarning): - _ = sampler.get_one_token(next_token_logits, past_sequence) - - def test_zero_temperature(self): - temperature = 0 - config = { - "do_sample": True, - "temperature": temperature, - "k": 0, - "p": 0, - "repetition_penalty": 1.0, - } - sampler = Sampler(**config) - next_token_logits = torch.rand(10).unsqueeze(0) - past_sequence = torch.tensor([]) - with self.assertRaises(ZeroDivisionError): - _ = sampler.get_one_token(next_token_logits, past_sequence) - - -class SamplerSingleStackTest(unittest.TestCase): - def test_raises_exception_when_no_LM_head(self): - models = [BertModel(BertConfig())] - for model in models: - with self.assertRaises(AttributeError): - model.decode() - - @pytest.mark.slow - def test_forward_pass_and_output_length(self): - models = { - "XLNet": XLNetLMHeadModel(XLNetConfig()), - "XLM": XLMWithLMHeadModel(XLMConfig()), - "TransfoXL": TransfoXLLMHeadModel(TransfoXLConfig()), - "GPT2": GPT2LMHeadModel(GPT2Config()), - "GPT": OpenAIGPTLMHeadModel(OpenAIGPTConfig()), - } - kwargs = { - "XLNet": {}, - "XLM": {"mask_token": 0}, - "TransfoXL": {}, - "GPT2": {}, - "GPT": {}, - } - prompt = torch.tensor([[1, 2, 3]], dtype=torch.long) - generated_length = 5 - expected_length = 8 - - for name, model in models.items(): - kwargs_model = kwargs[name] - output = model.decode(prompt_ids=prompt, length=generated_length, **kwargs_model) - self.assertEqual(len(output), expected_length) - - -class SamplerEncoderDecoderTest(unittest.TestCase): - @pytest.mark.slow - def test_forward_pass_and_output_length(self): - model = Model2Model.from_pretrained("bert-base-uncased") - - encoder_input_ids = torch.tensor([[1, 2, 3]], dtype=torch.long) - prompt = torch.tensor([[1, 2, 3]], dtype=torch.long) - generated_length = 5 - expected_length = 8 - - output = model.decode( - encoder_input_ids, - decoder_prompt_ids=prompt, - k=2, - p=0.5, - repetition_penalty=2, - length=generated_length, - ) - self.assertEqual(len(output), expected_length) - - -if __name__ == "__main__": - unittest.main() From 8e5587fb7935e3040c11118041ff729d33adcb09 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 18 Dec 2019 11:32:37 +0100 Subject: [PATCH 205/302] few fixes on sampling --- transformers/modeling_utils.py | 95 +++++++++++++++------------------- 1 file changed, 42 insertions(+), 53 deletions(-) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 6fa68a0db4..bbfb0614ad 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -23,14 +23,12 @@ import json import logging import os from io import open -import warnings import six import torch from torch import nn from torch.nn import CrossEntropyLoss from torch.nn import functional as F -from tqdm import trange from .configuration_utils import PretrainedConfig from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME @@ -82,7 +80,6 @@ class PreTrainedModel(nn.Module): "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format( self.__class__.__name__, self.__class__.__name__ )) - # Save config in model self.config = config @@ -220,9 +217,6 @@ class PreTrainedModel(nn.Module): # Tie weights if needed self.tie_weights() - # Initialize decoding head if we have output embeddings - - def 
prune_heads(self, heads_to_prune): """ Prunes heads of the base model. @@ -569,30 +563,36 @@ class PreTrainedModel(nn.Module): cur_len = input_ids.shape[1] vocab_size = self.config.vocab_size + if num_return_sequences != 1: + # Expand input to num return sequences + input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len) + input_ids = input_ids.contiguous().view(batch_size * num_return_sequences, cur_len) # (batch_size * num_return_sequences, cur_len) + effective_batch_size = batch_size * num_return_sequences + else: + effective_batch_size = batch_size + if num_beams > 1: - return self._generate_beam_search(input_ids, cur_len, max_length, do_sample, - temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size, - num_return_sequences, - length_penalty, num_beams, vocab_size) - return self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, + output = self._generate_beam_search(input_ids, cur_len, max_length, do_sample, + temperature, top_k, top_p, repetition_penalty, + pad_token_id, eos_token_ids, effective_batch_size, + length_penalty, num_beams, vocab_size) + else: + output = self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size, - num_return_sequences) + pad_token_id, eos_token_ids, effective_batch_size) + + if num_return_sequences != 1: + output = output.view(batch_size, num_return_sequences, -1) + return output def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size, - num_return_sequences): - """ Generate `num_return_sequences` sequences per batch example without beam search (num_beams == 1). + pad_token_id, eos_token_ids, batch_size): + """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ - # Expand input to num return sequences - input_ids = input_ids.unsqueeze(1).expand(batch_size, num_return_sequences, cur_len) - input_ids = input_ids.contiguous().view(batch_size*num_return_sequences, cur_len) # (batch_size*num_return_sequences, cur_len) - # current position / max lengths / length of generated sentences / unfinished sentences - unfinished_sents = input_ids.new(batch_size*num_return_sequences).fill_(1) + unfinished_sents = input_ids.new(batch_size).fill_(1) # cache compute states pasts = None @@ -604,7 +604,7 @@ class PreTrainedModel(nn.Module): # repetition penalty from CTRL paper (https://arxiv.org/abs/1909.05858) if repetition_penalty != 1.0: - for i in range(batch_size*num_return_sequences): + for i in range(batch_size): for previous_tokens in set(input_ids[i].tolist()): next_token_logits[i, previous_tokens] /= repetition_penalty @@ -635,22 +635,14 @@ class PreTrainedModel(nn.Module): if cur_len == max_length: input_ids[:, -1].masked_fill_(unfinished_sents.to(dtype=torch.bool), eos_token_ids[0]) - if num_return_sequences != 1: - input_ids = input_ids.view(batch_size, num_return_sequences, -1) - return input_ids def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, pad_token_id, eos_token_ids, batch_size, - num_return_sequences, length_penalty, num_beams, vocab_size): - """ Generate `num_return_sequences` sequences per batch example with beam search. - We return the top-`num_return_sequences` beams. 
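# Illustrative sketch (not from the patch): the unsqueeze/expand/view pattern that
# generate() above now uses to duplicate every prompt num_return_sequences times
# before decoding the copies independently.
import torch

input_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])              # (batch_size=2, cur_len=3)
num_return_sequences = 4
expanded = input_ids.unsqueeze(1).expand(2, num_return_sequences, 3)
flat = expanded.contiguous().view(2 * num_return_sequences, 3)
# flat has shape (8, 3): four identical copies of each prompt, one row per returned sequence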
- `num_return_sequences` should be bigger than `num_beams` (we default to the min of both) + """ Generate sequences for each example with beam search. """ - num_return_sequences = min(num_return_sequences, num_beams) - # Expand input to num beams input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams, cur_len) input_ids = input_ids.contiguous().view(batch_size * num_beams, cur_len) # (batch_size * num_beams, cur_len) @@ -685,7 +677,7 @@ class PreTrainedModel(nn.Module): if temperature != 1.0: scores = scores / temperature # Top-p/top-k filtering - scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p) # (batch_size * num_beams, vocab_size) + scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2) # (batch_size * num_beams, vocab_size) # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search) next_words = torch.multinomial(F.softmax(scores, dim=-1), num_samples=2) # (batch_size * num_beams, 2) # Compute next scores @@ -778,41 +770,35 @@ class PreTrainedModel(nn.Module): # print("") # select the best hypotheses - tgt_len = input_ids.new(batch_size, num_return_sequences) - bests = [] + tgt_len = input_ids.new(batch_size) + best = [] for i, hypotheses in enumerate(generated_hyps): - best_hyps = [hyp[1] for hyp in sorted(hypotheses.hyp, key=lambda hyp: hyp[0])[-num_return_sequences:]] - for j, hyp in enumerate(best_hyps): - tgt_len[i, j] = len(hyp) + 1 # +1 for the symbol - bests.append(best_hyps) + best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1] + tgt_len[i] = len(best_hyp) + 1 # +1 for the symbol + best.append(best_hyp) # generate target batch - decoded = input_ids.new(batch_size, num_return_sequences, tgt_len.max().item()).fill_(pad_token_id) - for i, hyps in enumerate(bests): - for j, hypo in enumerate(hyps): - decoded[i, j, :tgt_len[i, j] - 1] = hypo - decoded[i, j, tgt_len[i, j] - 1] = eos_token_ids[0] - - if num_return_sequences == 1: - decoded = decoded.squeeze(1) - # # sanity check - # assert (decoded == eos_token_ids[0]).sum() == 2 * batch_size + decoded = input_ids.new(batch_size, tgt_len.max().item()).fill_(pad_token_id) + for i, hypo in enumerate(best): + decoded[i, :tgt_len[i] - 1] = hypo + decoded[i, tgt_len[i] - 1] = eos_token_ids[0] return decoded -def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf')): +def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf'), min_tokens_to_keep=1): """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering Args: - logits: logits distribution shape (batch size x vocabulary size) + logits: logits distribution shape (batch size, vocabulary size) if top_k > 0: keep only top k tokens with highest probability (top-k filtering). if top_p < 1.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering). Nucleus filtering is described in Holtzman et al. 
(http://arxiv.org/abs/1904.09751) + Make sure we keep at least min_tokens_to_keep per batch example in the output From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317 """ - top_k = min(top_k, logits.size(-1)) # Safety check if top_k > 0: + top_k = min(max(top_k, min_tokens_to_keep), logits.size(-1)) # Safety check # Remove all tokens with a probability less than the last token of the top-k indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None] logits[indices_to_remove] = filter_value @@ -821,8 +807,11 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=1.0, filter_value=-float('Inf') sorted_logits, sorted_indices = torch.sort(logits, descending=True) cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1) - # Remove tokens with cumulative probability above the threshold + # Remove tokens with cumulative probability above the threshold (token with 0 are kept) sorted_indices_to_remove = cumulative_probs > top_p + if min_tokens_to_keep > 1: + # Keep at least min_tokens_to_keep (set to min_tokens_to_keep-1 because we add the first one below) + sorted_indices_to_remove[..., :min_tokens_to_keep] = 0 # Shift the indices to the right to keep also the first token above the threshold sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone() sorted_indices_to_remove[..., 0] = 0 From ca31abc6d6fe35a39703ed36775853595149e956 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 11:36:54 +0100 Subject: [PATCH 206/302] tokenization: *align* fairseq and spm vocab to fix some tokenization errors --- transformers/tokenization_xlm_roberta.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index 0f95397606..d8484e7f9c 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -61,7 +61,19 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): self.sp_model = spm.SentencePieceProcessor() self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file - self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2} + + # Original fairseq vocab and spm vocab must be "aligned": + # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 + # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ---- + # fairseq | '' | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' + # spm | '' | '' | '' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a' + + # Mimic fairseq token-to-id alignment for the first 4 token + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + + # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab + self.fairseq_offset = 1 + self.fairseq_tokens_to_ids[''] = len(self.sp_model) + len(self.fairseq_tokens_to_ids) self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} @@ -131,13 +143,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): """ Converts a token (str/unicode) in an id using the vocab. 
""" if token in self.fairseq_tokens_to_ids: return self.fairseq_tokens_to_ids[token] - return self.sp_model.PieceToId(token) + 1 + return self.sp_model.PieceToId(token) + self.fairseq_offset def _convert_id_to_token(self, index): """Converts an index (integer) in a token (string/unicode) using the vocab.""" if index in self.fairseq_ids_to_tokens: return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index + 1) + return self.sp_model.IdToPiece(index - self.fairseq_offset) def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file From 3d2096f516e99da79f1c6c60a48f828b4e7733ef Mon Sep 17 00:00:00 2001 From: thomwolf Date: Wed, 18 Dec 2019 11:50:54 +0100 Subject: [PATCH 207/302] further cleanup --- examples/run_generation.py | 13 ++++--- transformers/configuration_xlm.py | 4 +++ transformers/modeling_utils.py | 18 ++++++---- transformers/modeling_xlm.py | 39 +++++++------------- transformers/modeling_xlnet.py | 58 +++++++++++++----------------- transformers/tokenization_utils.py | 2 +- 6 files changed, 58 insertions(+), 76 deletions(-) diff --git a/examples/run_generation.py b/examples/run_generation.py index 2075ad8457..8121f4f5aa 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -91,7 +91,7 @@ def prepare_ctrl_input(args, _, tokenizer, prompt_text): def prepare_xlm_input(args, model, tokenizer, prompt_text): - kwargs = {"language": None, "mask_token": None} + kwargs = {"language": None, "mask_token_id": None} # Set the language use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb @@ -112,7 +112,7 @@ def prepare_xlm_input(args, model, tokenizer, prompt_text): # XLM masked-language modeling (MLM) models need masked token is_xlm_mlm = "mlm" in args.model_name_or_path if is_xlm_mlm: - kwargs["mask_token"] = tokenizer.mask_token_id + kwargs["mask_token_id"] = tokenizer.mask_token_id return prompt_text, kwargs @@ -204,14 +204,13 @@ def main(): prompt_text, model_kwargs = prepare_input(args, model, tokenizer, prompt_text) encoded_prompt = torch.tensor(tokenizer.encode(prompt_text, add_special_tokens=False)).unsqueeze(0) - output_sequences = model.decode( - prompt_ids=encoded_prompt, + output_sequences = model.generate( + intput_ids=encoded_prompt, length=args.length, temperature=args.temperature, - k=args.k, - p=args.p, + top_k=args.k, + top_p=args.p, repetition_penalty=args.repetition_penalty, - device=args.device, **model_kwargs, ) diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index fa3a5f40f6..1938b85741 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -113,6 +113,8 @@ class XLMConfig(PretrainedConfig): summary_first_dropout=0.1, start_n_top=5, end_n_top=5, + mask_token_id = 0, + lang_id = 0, **kwargs): """Constructs XLMConfig. 
""" @@ -156,6 +158,8 @@ class XLMConfig(PretrainedConfig): self.summary_first_dropout = summary_first_dropout self.start_n_top = start_n_top self.end_n_top = end_n_top + self.mask_token_id = mask_token_id + self.lang_id = lang_id else: raise ValueError("First argument must be either a vocabulary size (int)" " or the path to a pretrained model config file (str)") diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index bbfb0614ad..f55c209ac0 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -488,7 +488,7 @@ class PreTrainedModel(nn.Module): def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, bos_token_id=None, pad_token_id=None, eos_token_ids=None, batch_size=None, - length_penalty=None, num_return_sequences=None, **kwargs): + length_penalty=None, num_return_sequences=None, **model_kwargs): """ Sequence generator for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling @@ -575,11 +575,13 @@ class PreTrainedModel(nn.Module): output = self._generate_beam_search(input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, pad_token_id, eos_token_ids, effective_batch_size, - length_penalty, num_beams, vocab_size) + length_penalty, num_beams, vocab_size, + **model_kwargs) else: output = self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, effective_batch_size) + pad_token_id, eos_token_ids, effective_batch_size, + **model_kwargs) if num_return_sequences != 1: output = output.view(batch_size, num_return_sequences, -1) @@ -587,7 +589,8 @@ class PreTrainedModel(nn.Module): def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size): + pad_token_id, eos_token_ids, batch_size, + **model_kwargs): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. """ @@ -598,7 +601,7 @@ class PreTrainedModel(nn.Module): pasts = None while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) + model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts, **model_kwargs) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] @@ -640,7 +643,8 @@ class PreTrainedModel(nn.Module): def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, pad_token_id, eos_token_ids, batch_size, - length_penalty, num_beams, vocab_size): + length_penalty, num_beams, vocab_size, + **model_kwargs): """ Generate sequences for each example with beam search. 
""" # Expand input to num beams @@ -662,7 +666,7 @@ class PreTrainedModel(nn.Module): done = [False for _ in range(batch_size)] while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) + model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts, **model_kwargs) scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 295fff7943..6691b0f60b 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -639,6 +639,18 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): def get_output_embeddings(self): return self.pred_layer.proj + def prepare_inputs_for_generation(self, input_ids, **model_kwargs): + mask_token_id = model_kwargs['mask_token_id'] if 'mask_token_id' in model_kwargs else self.config.mask_token_id + lang_id = model_kwargs['lang_id'] if 'lang_id' in model_kwargs else self.config.lang_id + + mask_token = torch.full((1, 1), mask_token_id, dtype=torch.long, device=input_ids.device) + input_ids = torch.cat([input_ids, mask_token], dim=1) + if lang_id is not None: + langs = torch.full_like(input_ids, lang_id) + else: + langs = None + return {"input_ids": input_ids, "langs": langs} + def forward(self, input_ids=None, attention_mask=None, langs=None, token_type_ids=None, position_ids=None, lengths=None, cache=None, head_mask=None, inputs_embeds=None, labels=None): transformer_outputs = self.transformer(input_ids, @@ -657,33 +669,6 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): return outputs - def _prepare_inputs_for_decoding(self, input_ids, **model_kwargs): - mask_token = model_kwargs.pop("mask_token", None) - language = model_kwargs.pop("language", None) - input_ids = self._append_mask_token(input_ids, mask_token) - langs = self._create_language_embeddings(input_ids, language) - arguments = {"input_ids": input_ids, "langs": langs} - arguments.update(model_kwargs) - - return arguments - - @staticmethod - def _append_mask_token(sequence, mask_token_id): - """ Append a [MASK] token at the end of the sequence that the MLM model - is going to try to predict. - """ - if mask_token_id is not None: - tokens_to_append = torch.full((1, 1), mask_token_id, dtype=torch.long) - return torch.cat((sequence, tokens_to_append), dim=1) - - return sequence - - @staticmethod - def _create_language_embeddings(sequence, language): - if language is not None: - return torch.tensor([language] * sequence.shape[1]).view(1, -1) - return None - @add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. 
""", diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py index 2153923dd2..26b95076cd 100644 --- a/transformers/modeling_xlnet.py +++ b/transformers/modeling_xlnet.py @@ -947,6 +947,30 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): def get_output_embeddings(self): return self.lm_loss + def prepare_inputs_for_generation(self, input_ids, **model_kwargs): + # Add dummy token at the end (no attention on this one) + dummy_token = torch.zeros((1, 1), dtype=torch.long, device=input_ids.device) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + # Build permutation mask so that previous tokens don't see last token + perm_mask = torch.zeros( + (input_ids.shape[0], input_ids.shape[1], input_ids.shape[1]), + dtype=torch.float, device=input_ids.device + ) + perm_mask[:, :, -1] = 1.0 + + # We'll only predict the last token + target_mapping = torch.zeros( + (input_ids.shape[0], 1, input_ids.shape[1]), + dtype=torch.float, device=input_ids.device + ) + target_mapping[0, 0, -1] = 1.0 + + return {"input_ids": input_ids, + "perm_mask": perm_mask, + "target_mapping": target_mapping + } + def forward(self, input_ids=None, attention_mask=None, mems=None, perm_mask=None, target_mapping=None, token_type_ids=None, input_mask=None, head_mask=None, inputs_embeds=None, labels=None): transformer_outputs = self.transformer(input_ids, @@ -972,40 +996,6 @@ class XLNetLMHeadModel(XLNetPreTrainedModel): return outputs # return (loss), logits, (mems), (hidden states), (attentions) - def _prepare_inputs_for_decoding(self, input_ids, **model_kwargs): - input_ids = self._add_dummy_token(input_ids) - perm_mask = self._create_perm_mask(input_ids) - target_mapping = self._create_target_mapping(input_ids) - arguments = { - "input_ids": input_ids, - "perm_mask": perm_mask, - "target_mapping": target_mapping, - } - return arguments - - @staticmethod - def _add_dummy_token(sequence): - dummy = torch.zeros((sequence.size(0), 1), dtype=torch.long) - return torch.cat((sequence, dummy), dim=1) - - @staticmethod - def _create_perm_mask(sequence): - mask = torch.zeros( - (sequence.shape[0], sequence.shape[1], sequence.shape[1]), - dtype=torch.float, - ) - mask[:, :, -1] = 1.0 # Previous tokens don't see last token - return mask - - @staticmethod - def _create_target_mapping(sequence): - target_mapping = torch.zeros( - (sequence.shape[0], 1, sequence.shape[1]), - dtype=torch.float, - ) - target_mapping[0, 0, -1] = 1.0 # predict last token - return target_mapping - @add_start_docstrings("""XLNet Model with a sequence classification/regression head on top (a linear layer on top of the pooled output) e.g. for GLUE tasks. """, diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index f4395cd82c..2e0d6caef2 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -761,7 +761,7 @@ class PreTrainedTokenizer(object): padding index, up to their max length. If no max length is specified, the padding is done up to the model's max length. The tokenizer padding sides are handled by the following strings: - 'left': pads on the left of the sequences - - 'right': pads on the right of the sequences + - 'right': pads on the right of the sequences Defaults to False: no padding. return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant or PyTorch torch.Tensor instead of a list of python integers. 
From 01b68be34f906a210b0c0a5e2bd9bc605c5983f2 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 12:24:46 +0100 Subject: [PATCH 208/302] converter: remove XLM-RoBERTa specific script (can be done with the script for RoBERTa now) --- ..._original_pytorch_checkpoint_to_pytorch.py | 184 ------------------ 1 file changed, 184 deletions(-) delete mode 100644 transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py deleted file mode 100644 index 884c273d2c..0000000000 --- a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py +++ /dev/null @@ -1,184 +0,0 @@ -# coding=utf-8 -# Copyright 2018 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Convert RoBERTa checkpoint.""" - -from __future__ import absolute_import, division, print_function - -import argparse -import logging -import numpy as np -import torch -import pathlib - -from fairseq.models.roberta import RobertaModel as FairseqRobertaModel -from fairseq.modules import TransformerSentenceEncoderLayer -from transformers.modeling_bert import (BertConfig, BertEncoder, - BertIntermediate, BertLayer, - BertModel, BertOutput, - BertSelfAttention, - BertSelfOutput) -from transformers.modeling_roberta import (RobertaEmbeddings, - RobertaForMaskedLM, - RobertaForSequenceClassification, - RobertaModel) - -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -SAMPLE_TEXT = 'Hello world! cécé herlolip' - - -def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head): - """ - Copy/paste/tweak roberta's weights to our BERT structure. - """ - roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece') - roberta.eval() # disable dropout - config = BertConfig( - vocab_size=250002, - hidden_size=roberta.args.encoder_embed_dim, - num_hidden_layers=roberta.args.encoder_layers, - num_attention_heads=roberta.args.encoder_attention_heads, - intermediate_size=roberta.args.encoder_ffn_embed_dim, - max_position_embeddings=514, - type_vocab_size=1, - layer_norm_eps=1e-5, # PyTorch default used in fairseq - ) - if classification_head: - config.num_labels = roberta.args.num_classes - print("Our BERT config:", config) - - model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config) - model.eval() - - # Now let's copy all the weights. 
- # Embeddings - roberta_sent_encoder = roberta.model.decoder.sentence_encoder - model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight - model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight - model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight) # just zero them out b/c RoBERTa doesn't use them. - model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight - model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias - - for i in range(config.num_hidden_layers): - # Encoder: start of layer - layer: BertLayer = model.roberta.encoder.layer[i] - roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i] - - ### self attention - self_attn: BertSelfAttention = layer.attention.self - assert( - roberta_layer.self_attn.k_proj.weight.data.shape == \ - roberta_layer.self_attn.q_proj.weight.data.shape == \ - roberta_layer.self_attn.v_proj.weight.data.shape == \ - torch.Size((config.hidden_size, config.hidden_size)) - ) - - self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight - self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias - self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight - self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias - self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight - self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias - - ### self-attention output - self_output: BertSelfOutput = layer.attention.output - assert( - self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape - ) - self_output.dense.weight = roberta_layer.self_attn.out_proj.weight - self_output.dense.bias = roberta_layer.self_attn.out_proj.bias - self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight - self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias - - ### intermediate - intermediate: BertIntermediate = layer.intermediate - assert( - intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape - ) - intermediate.dense.weight = roberta_layer.fc1.weight - intermediate.dense.bias = roberta_layer.fc1.bias - - ### output - bert_output: BertOutput = layer.output - assert( - bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape - ) - bert_output.dense.weight = roberta_layer.fc2.weight - bert_output.dense.bias = roberta_layer.fc2.bias - bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight - bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias - #### end of layer - - if classification_head: - model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight - model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias - model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight - model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias - else: - # LM Head - model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight - model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias - model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight - model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias - model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight - model.lm_head.bias = roberta.model.decoder.lm_head.bias - - # Let's check that 
we get the same results. - input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1 - - our_output = model(input_ids)[0] - if classification_head: - their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids)) - else: - their_output = roberta.model(input_ids)[0] - print(our_output.shape, their_output.shape) - max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item() - print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7 - success = torch.allclose(our_output, their_output, atol=1e-3) - print( - "Do both models output the same tensors?", - "🔥" if success else "💩" - ) - if not success: - raise Exception("Something went wRoNg") - - pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True) - print(f"Saving model to {pytorch_dump_folder_path}") - model.save_pretrained(pytorch_dump_folder_path) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - ## Required parameters - parser.add_argument("--roberta_checkpoint_path", - default = None, - type = str, - required = True, - help = "Path the official PyTorch dump.") - parser.add_argument("--pytorch_dump_folder_path", - default = None, - type = str, - required = True, - help = "Path to the output PyTorch model.") - parser.add_argument("--classification_head", - action = "store_true", - help = "Whether to convert a final classification head.") - args = parser.parse_args() - convert_roberta_checkpoint_to_pytorch( - args.roberta_checkpoint_path, - args.pytorch_dump_folder_path, - args.classification_head - ) From 8efc6dd544bf1a30d99d4b5abfc5e214699eab2b Mon Sep 17 00:00:00 2001 From: Lysandre Date: Wed, 18 Dec 2019 10:47:59 -0500 Subject: [PATCH 209/302] fix #2214 --- transformers/configuration_xlm.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index 0740cc4026..6839a45746 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -144,6 +144,9 @@ class XLMConfig(PretrainedConfig): self.start_n_top = start_n_top self.end_n_top = end_n_top + if "n_words" in kwargs: + self.n_words = kwargs["n_words"] + @property def n_words(self): # For backward compatibility return self.vocab_size From 0c88c856d592134ee5a9a636f9b73f40b91784b5 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 18 Dec 2019 18:18:16 +0100 Subject: [PATCH 210/302] Unnest QuestionAnsweringArgumentHandler --- transformers/pipelines.py | 62 +++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index bcb4d9e054..a10078b027 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -333,6 +333,63 @@ class NerPipeline(Pipeline): return answers +class QuestionAnsweringArgumentHandler(ArgumentHandler): + """ + QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped + to internal SquadExample / SquadFeature structures. + + QuestionAnsweringArgumentHandler manages all the possible to create SquadExample from the command-line supplied + arguments. 
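# Illustrative sketch (not from the patch): the call formats that the __call__ implementation
# below normalizes into SquadExample objects; `nlp` is assumed to be an already constructed
# QuestionAnsweringPipeline instance.
nlp(question='Who wrote the report?', context='The report was written by Jane Doe in 2018.')
nlp({'question': 'Who wrote the report?', 'context': 'The report was written by Jane Doe in 2018.'})
nlp(X=[{'question': 'Who wrote the report?', 'context': 'The report was written by Jane Doe in 2018.'},
       {'question': 'When was it written?', 'context': 'The report was written by Jane Doe in 2018.'}])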
+ """ + def __call__(self, *args, **kwargs): + # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating + if args is not None and len(args) > 0: + if len(args) == 1: + kwargs['X'] = args[0] + else: + kwargs['X'] = list(args) + + # Generic compatibility with sklearn and Keras + # Batched data + if 'X' in kwargs or 'data' in kwargs: + data = kwargs['X'] if 'X' in kwargs else kwargs['data'] + + if not isinstance(data, list): + data = [data] + + for i, item in enumerate(data): + if isinstance(item, dict): + if any(k not in item for k in ['question', 'context']): + raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') + data[i] = QuestionAnsweringPipeline.create_sample(**item) + + elif isinstance(item, SquadExample): + continue + else: + raise ValueError( + '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' + .format('X' if 'X' in kwargs else 'data') + ) + inputs = data + + # Tabular input + elif 'question' in kwargs and 'context' in kwargs: + if isinstance(kwargs['question'], str): + kwargs['question'] = [kwargs['question']] + + if isinstance(kwargs['context'], str): + kwargs['context'] = [kwargs['context']] + + inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] + else: + raise ValueError('Unknown arguments {}'.format(kwargs)) + + if not isinstance(inputs, list): + inputs = [inputs] + + return inputs + + class QuestionAnsweringPipeline(Pipeline): """ Question Answering pipeline using ModelForQuestionAnswering head. @@ -403,8 +460,9 @@ class QuestionAnsweringPipeline(Pipeline): else: return SquadExample(None, question, context, None, None, None) - def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer]): - super().__init__(model, tokenizer, args_parser=QuestionAnsweringPipeline.QuestionAnsweringArgumentHandler()) + def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer], device: int = -1, **kwargs): + super().__init__(model, tokenizer, args_parser=QuestionAnsweringArgumentHandler(), + device=device, **kwargs) def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: """ From 41a13a6375817ec3836bedcec67d12ad32bf0956 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 18:20:27 +0100 Subject: [PATCH 211/302] auto: add XLMRoBERTa to auto configuration --- transformers/configuration_auto.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py index 9fe58f173a..29e1a0ee1d 100644 --- a/transformers/configuration_auto.py +++ b/transformers/configuration_auto.py @@ -30,6 +30,7 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CO from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP +from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP logger = logging.getLogger(__name__) @@ -48,6 +49,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value) ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP, + XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ] for key, value, in pretrained_map.items()) @@ -66,6 +68,7 @@ class AutoConfig(object): - contains `distilbert`: 
DistilBertConfig (DistilBERT model) - contains `albert`: AlbertConfig (ALBERT model) - contains `camembert`: CamembertConfig (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - contains `roberta`: RobertaConfig (RoBERTa model) - contains `bert`: BertConfig (Bert model) - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) @@ -91,6 +94,7 @@ class AutoConfig(object): - contains `distilbert`: DistilBertConfig (DistilBERT model) - contains `albert`: AlbertConfig (ALBERT model) - contains `camembert`: CamembertConfig (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model) - contains `roberta`: RobertaConfig (RoBERTa model) - contains `bert`: BertConfig (Bert model) - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model) @@ -152,6 +156,8 @@ class AutoConfig(object): return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'camembert' in pretrained_model_name_or_path: return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif 'xlm-roberta' in pretrained_model_name_or_path: + return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -170,4 +176,4 @@ class AutoConfig(object): return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) + "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) From 036831e2791b84edb5a89db7a92af7c69f4ff37e Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 18:23:42 +0100 Subject: [PATCH 212/302] auto: add XLM-RoBERTa to audo modeling --- transformers/modeling_auto.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 1a30ea4623..ca6c4525b8 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -30,6 +30,7 @@ from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForMultipleChoice, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_utils import PreTrainedModel, SequenceSummary @@ -52,6 +53,7 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value) ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_MODEL_ARCHIVE_MAP, + XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ] for key, value, in pretrained_map.items()) @@ -72,6 +74,7 @@ class AutoModel(object): - contains `distilbert`: DistilBertModel (DistilBERT model) - contains `albert`: AlbertModel (ALBERT model) - contains `camembert`: CamembertModel 
(CamemBERT model) + - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model) - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) @@ -98,6 +101,7 @@ class AutoModel(object): - contains `distilbert`: DistilBertModel (DistilBERT model) - contains `albert`: AlbertModel (ALBERT model) - contains `camembert`: CamembertModel (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model) - contains `roberta`: RobertaModel (RoBERTa model) - contains `bert`: BertModel (Bert model) - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model) @@ -175,6 +179,8 @@ class AutoModel(object): return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'camembert' in pretrained_model_name_or_path: return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm-roberta' in pretrained_model_name_or_path: + return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -193,7 +199,7 @@ class AutoModel(object): return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) class AutoModelWithLMHead(object): @@ -212,6 +218,7 @@ class AutoModelWithLMHead(object): - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) - contains `albert`: AlbertForMaskedLM (ALBERT model) - contains `camembert`: CamembertForMaskedLM (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model) - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - contains `bert`: BertForMaskedLM (Bert model) - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) @@ -241,6 +248,7 @@ class AutoModelWithLMHead(object): - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model) - contains `albert`: AlbertForMaskedLM (ALBERT model) - contains `camembert`: CamembertForMaskedLM (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model) - contains `roberta`: RobertaForMaskedLM (RoBERTa model) - contains `bert`: BertForMaskedLM (Bert model) - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model) @@ -317,6 +325,8 @@ class AutoModelWithLMHead(object): return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'camembert' in pretrained_model_name_or_path: return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm-roberta' in pretrained_model_name_or_path: + return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -335,7 +345,7 @@ class AutoModelWithLMHead(object): return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise 
ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) class AutoModelForSequenceClassification(object): @@ -353,6 +363,7 @@ class AutoModelForSequenceClassification(object): - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model) - contains `albert`: AlbertForSequenceClassification (ALBERT model) - contains `camembert`: CamembertForSequenceClassification (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model) - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) - contains `bert`: BertForSequenceClassification (Bert model) - contains `xlnet`: XLNetForSequenceClassification (XLNet model) @@ -377,6 +388,7 @@ class AutoModelForSequenceClassification(object): - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model) - contains `albert`: AlbertForSequenceClassification (ALBERT model) - contains `camembert`: CamembertForSequenceClassification (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model) - contains `roberta`: RobertaForSequenceClassification (RoBERTa model) - contains `bert`: BertForSequenceClassification (Bert model) - contains `xlnet`: XLNetForSequenceClassification (XLNet model) @@ -448,6 +460,8 @@ class AutoModelForSequenceClassification(object): return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'camembert' in pretrained_model_name_or_path: return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm-roberta' in pretrained_model_name_or_path: + return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -458,7 +472,7 @@ class AutoModelForSequenceClassification(object): return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. 
Should contains one of " - "'bert', 'xlnet', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) + "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path)) class AutoModelForQuestionAnswering(object): From 64a971a9156788ed6d95f850453578ecb74069c5 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 18:24:32 +0100 Subject: [PATCH 213/302] auto: add XLM-RoBERTa to auto tokenization --- transformers/tokenization_auto.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py index 173dee0e2b..5377bd48cb 100644 --- a/transformers/tokenization_auto.py +++ b/transformers/tokenization_auto.py @@ -31,6 +31,7 @@ from .tokenization_distilbert import DistilBertTokenizer from .tokenization_camembert import CamembertTokenizer from .tokenization_albert import AlbertTokenizer from .tokenization_t5 import T5Tokenizer +from .tokenization_xlm_roberta import XLMRobertaTokenizer logger = logging.getLogger(__name__) @@ -49,6 +50,7 @@ class AutoTokenizer(object): - contains `distilbert`: DistilBertTokenizer (DistilBert model) - contains `albert`: AlbertTokenizer (ALBERT model) - contains `camembert`: CamembertTokenizer (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model) - contains `roberta`: RobertaTokenizer (RoBERTa model) - contains `bert`: BertTokenizer (Bert model) - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model) @@ -75,6 +77,7 @@ class AutoTokenizer(object): - contains `distilbert`: DistilBertTokenizer (DistilBert model) - contains `albert`: AlbertTokenizer (ALBERT model) - contains `camembert`: CamembertTokenizer (CamemBERT model) + - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model) - contains `roberta`: RobertaTokenizer (RoBERTa model) - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model) - contains `bert`: BertTokenizer (Bert model) @@ -130,6 +133,8 @@ class AutoTokenizer(object): return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'camembert' in pretrained_model_name_or_path: return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + elif 'xlm-roberta' in pretrained_model_name_or_path: + return XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) elif 'bert-base-japanese' in pretrained_model_name_or_path: @@ -150,4 +155,4 @@ class AutoTokenizer(object): return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', " - "'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) + "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path)) From 04b602f96f91529a1259909466705f3f9192113c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 18 Dec 2019 18:28:39 +0100 Subject: [PATCH 214/302] Put module import on top of the module. 
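
Besides moving the os.path import to module level, this guards the output/input
existence checks so that a PipelineDataFormat can be constructed without an
output path: the checks now only run when output is provided.

A minimal sketch of the now-allowed call (the module path is the current one,
the file name is an illustrative placeholder, not part of this patch):

    from transformers.pipelines import PipelineDataFormat

    # No output file given: no up-front OSError about existing/missing paths.
    fmt = PipelineDataFormat.from_str('csv', output=None, path='reviews.csv', column='text')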
--- transformers/pipelines.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index a10078b027..92c94268a7 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -20,6 +20,7 @@ import os from abc import ABC, abstractmethod from contextlib import contextmanager from itertools import groupby +from os.path import abspath, exists from typing import Union, Optional, Tuple, List, Dict import numpy as np @@ -100,12 +101,12 @@ class PipelineDataFormat: if self.is_multi_columns: self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column] - from os.path import abspath, exists - if exists(abspath(self.output)): - raise OSError('{} already exists on disk'.format(self.output)) + if output is not None: + if exists(abspath(self.output)): + raise OSError('{} already exists on disk'.format(self.output)) - if not exists(abspath(self.path)): - raise OSError('{} doesnt exist on disk'.format(self.path)) + if not exists(abspath(self.path)): + raise OSError('{} doesnt exist on disk'.format(self.path)) @abstractmethod def __iter__(self): From e778dd854dd5d1fd29396d214577ddbe0f854247 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 19:27:34 +0100 Subject: [PATCH 215/302] modeling: add XLM-RoBERTa base model --- transformers/modeling_xlm_roberta.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index 4c833c69ff..abace25d5b 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -27,6 +27,7 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { + 'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-pytorch_model.bin", 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-pytorch_model.bin", } From 128cfdee9bdd13f2f0bbef977907ff014ae398ad Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 19:28:16 +0100 Subject: [PATCH 216/302] tokenization add XLM-RoBERTa base model --- transformers/tokenization_xlm_roberta.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index d8484e7f9c..93b1c397e4 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -30,11 +30,13 @@ VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} PRETRAINED_VOCAB_FILES_MAP = { 'vocab_file': { + 'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-sentencepiece.bpe.model", 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-sentencepiece.bpe.model", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + 'xlm-roberta-base': None, 'xlm-roberta-large': None, } From 3e89fca54359bff728782cce4157118caed33c09 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 19:44:23 +0100 Subject: [PATCH 217/302] readme: add XLM-RoBERTa to model architecture list --- README.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index c33a65bdbb..a9d0fb3ace 100644 --- a/README.md +++ b/README.md @@ -146,7 +146,8 @@ At some point in the future, you'll be able to seamlessly move from pre-training 10. 
**[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot. 11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. 12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu. -13. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. +13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +14. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR. These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html). @@ -168,7 +169,7 @@ import torch from transformers import * # Transformers has a unified API -# for 8 transformer architectures and 30 pretrained weights. +# for 10 transformer architectures and 30 pretrained weights. 
# Model | Tokenizer | Pretrained weights shortcut MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'), (OpenAIGPTModel, OpenAIGPTTokenizer, 'openai-gpt'), @@ -178,7 +179,9 @@ MODELS = [(BertModel, BertTokenizer, 'bert-base-uncased'), (XLNetModel, XLNetTokenizer, 'xlnet-base-cased'), (XLMModel, XLMTokenizer, 'xlm-mlm-enfr-1024'), (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'), - (RobertaModel, RobertaTokenizer, 'roberta-base')] + (RobertaModel, RobertaTokenizer, 'roberta-base'), + (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'), + ] # To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel` From d35405b7a32eadf4fb1200249b2bbc4c12fb0340 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 19:45:10 +0100 Subject: [PATCH 218/302] docs: add XLM-RoBERTa to index page --- docs/source/index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/index.rst b/docs/source/index.rst index 48282c1c6c..cb34c5c7f0 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -50,6 +50,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train 9. `CTRL `_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation `_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 10. `CamemBERT `_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model `_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot. 11. `ALBERT `_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations `_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. +13. `XLM-RoBERTa `_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. .. toctree:: :maxdepth: 2 From dd7a958fd6963d09850ad4842307d1d1064d096d Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 19:45:46 +0100 Subject: [PATCH 219/302] docs: add XLM-RoBERTa to pretrained model list (incl. 
all parameters) --- docs/source/pretrained_models.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index 7d037da34f..a359990f5a 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -240,6 +240,12 @@ Here is the full list of the currently provided pretrained models together with | | ``t5-11B`` | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads, | | | | | Trained on English text: the Colossal Clean Crawled Corpus (C4) | +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| XLM-RoBERTa | ``xlm-roberta-base`` | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads, | +| | | | Trained on on 2.5 TB of newly created clean CommonCrawl data in 100 languages | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``xlm-roberta-large`` | | ~355M parameters with 24-layers, 1027-hidden-state, 4096 feed-forward hidden-state, 16-heads, | +| | | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages | ++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ .. `__ From f09d9996413f2b265f1c672d7a4b438e4c5099c4 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 19:49:33 +0100 Subject: [PATCH 220/302] =?UTF-8?q?docs:=20fix=20numbering=20=F0=9F=98=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/source/index.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index cb34c5c7f0..0ac9c740a5 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -50,7 +50,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train 9. `CTRL `_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation `_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher. 10. `CamemBERT `_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model `_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot. 11. `ALBERT `_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations `_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut. -13. `XLM-RoBERTa `_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. +12. 
`XLM-RoBERTa `_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale `_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. .. toctree:: :maxdepth: 2 From 8bb166db5de426ecfb2c3bf1160307324419bbda Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 18 Dec 2019 22:53:19 +0100 Subject: [PATCH 221/302] Expose more information in the output of TextClassificationPipeline --- transformers/pipelines.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 92c94268a7..809096a30c 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -283,7 +283,9 @@ class TextClassificationPipeline(Pipeline): self._nb_classes = nb_classes def __call__(self, *args, **kwargs): - return super().__call__(*args, **kwargs).tolist() + outputs = super().__call__(*args, **kwargs) + scores = np.exp(outputs) / np.exp(outputs).sum(-1) + return [{'label': self.model.config.id2label[item.argmax()], 'score': item.max()} for item in scores] class NerPipeline(Pipeline): From 7711403bbdefad62e7ee88a88e04ec08b53412bc Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 18 Dec 2019 22:59:51 +0100 Subject: [PATCH 222/302] Expose config through the cli arguments --- transformers/commands/run.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index 8c203699a8..44c1127803 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -16,7 +16,7 @@ def try_infer_format_from_ext(path: str): def run_command_factory(args): - nlp = pipeline(task=args.task, model=args.model, tokenizer=args.tokenizer, device=args.device) + nlp = pipeline(task=args.task, model=args.model, config=args.config, tokenizer=args.tokenizer, device=args.device) format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format reader = PipelineDataFormat.from_str(format, args.output, args.input, args.column) return RunCommand(nlp, reader) @@ -34,6 +34,7 @@ class RunCommand(BaseTransformersCLICommand): run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run') run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.') + run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.') run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)') run_parser.add_argument('--column', type=str, required=True, help='Name of the column to use as input. 
(For multi columns input as QA use column1,columns2)') run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from') From d0724d0794e930eb5821d4289130beade3359e87 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Wed, 18 Dec 2019 23:27:26 +0100 Subject: [PATCH 223/302] Add PipedPipelineDataFormat --- transformers/__init__.py | 4 +- transformers/commands/run.py | 7 +- transformers/pipelines.py | 154 +++++++++++++++-------------------- 3 files changed, 70 insertions(+), 95 deletions(-) diff --git a/transformers/__init__.py b/transformers/__init__.py index c474696062..87ede9f6a8 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -191,8 +191,8 @@ from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name load_tf2_model_in_pytorch_model) # Pipelines -# from .pipeline_ import TextClassificationPipeline -from .pipelines import Pipeline, pipeline, TextClassificationPipeline +from .pipelines import pipeline, PipelineDataFormat, CsvPipelineDataFormat, JsonPipelineDataFormat, PipedPipelineDataFormat, \ + Pipeline, FeatureExtractionPipeline, QuestionAnsweringPipeline, NerPipeline, TextClassificationPipeline if not is_tf_available() and not is_torch_available(): logger.warning("Neither PyTorch nor TensorFlow >= 2.0 have been found." diff --git a/transformers/commands/run.py b/transformers/commands/run.py index 44c1127803..7c00c0057f 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -36,11 +36,10 @@ class RunCommand(BaseTransformersCLICommand): run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.') run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.') run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)') - run_parser.add_argument('--column', type=str, required=True, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)') + run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)') run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from') - run_parser.add_argument('--input', type=str, required=True, help='Path to the file to use for inference') - run_parser.add_argument('--output', type=str, required=True, help='Path to the file that will be used post to write results.') - run_parser.add_argument('kwargs', nargs='*', help='Arguments to forward to the file format reader') + run_parser.add_argument('--input', type=str, help='Path to the file to use for inference') + run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.') run_parser.set_defaults(func=run_command_factory) def run(self): diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 809096a30c..9e7051b70a 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -66,20 +66,6 @@ class DefaultArgumentHandler(ArgumentHandler): raise ValueError('Unable to infer the format of the provided data (X=, data=, ...)') -class _ScikitCompat(ABC): - """ - Interface layer for the Scikit and Keras compatibility. 
- """ - - @abstractmethod - def transform(self, X): - raise NotImplementedError() - - @abstractmethod - def predict(self, X): - raise NotImplementedError() - - class PipelineDataFormat: """ Base class for all the pipeline supported data format both for reading and writing. @@ -90,12 +76,12 @@ class PipelineDataFormat: PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. """ - SUPPORTED_FORMATS = ['json', 'csv'] + SUPPORTED_FORMATS = ['json', 'csv', 'pipe'] - def __init__(self, output: str, path: str, column: str): + def __init__(self, output: Optional[str], path: Optional[str], column: Optional[str]): self.output = output self.path = path - self.column = column.split(',') + self.column = column.split(',') if column else [''] self.is_multi_columns = len(self.column) > 1 if self.is_multi_columns: @@ -117,17 +103,19 @@ class PipelineDataFormat: raise NotImplementedError() @staticmethod - def from_str(name: str, output: str, path: str, column: str): + def from_str(name: str, output: Optional[str], path: Optional[str], column: Optional[str]): if name == 'json': return JsonPipelineDataFormat(output, path, column) elif name == 'csv': return CsvPipelineDataFormat(output, path, column) + elif name == 'pipe': + return PipedPipelineDataFormat(output, path, column) else: - raise KeyError('Unknown reader {} (Available reader are json/csv)'.format(name)) + raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(name)) class CsvPipelineDataFormat(PipelineDataFormat): - def __init__(self, output: str, path: str, column: str): + def __init__(self, output: Optional[str], path: Optional[str], column: Optional[str]): super().__init__(output, path, column) def __iter__(self): @@ -148,7 +136,7 @@ class CsvPipelineDataFormat(PipelineDataFormat): class JsonPipelineDataFormat(PipelineDataFormat): - def __init__(self, output: str, path: str, column: str): + def __init__(self, output: Optional[str], path: Optional[str], column: Optional[str]): super().__init__(output, path, column) with open(path, 'r') as f: @@ -166,6 +154,50 @@ class JsonPipelineDataFormat(PipelineDataFormat): json.dump(data, f) +class PipedPipelineDataFormat(PipelineDataFormat): + """ + Read data from piped input to the python process. + For multi columns data, columns should separated by \t + + If columns are provided, then the output will be a dictionary with {column_x: value_x} + """ + def __iter__(self): + import sys + for line in sys.stdin: + + # Split for multi-columns + if '\t' in line: + + line = line.split('\t') + if self.column: + # Dictionary to map arguments + yield {kwargs: l for (kwargs, _), l in zip(self.column, line)} + else: + yield tuple(line) + + # No dictionary to map arguments + else: + print(line) + yield line + + def save(self, data: dict): + print(data) + + +class _ScikitCompat(ABC): + """ + Interface layer for the Scikit and Keras compatibility. + """ + + @abstractmethod + def transform(self, X): + raise NotImplementedError() + + @abstractmethod + def predict(self, X): + raise NotImplementedError() + + class Pipeline(_ScikitCompat): """ Base class implementing pipelined operations. 
@@ -208,18 +240,6 @@ class Pipeline(_ScikitCompat): """ return self(X=X) - def __call__(self, *texts, **kwargs): - # Parse arguments - inputs = self._args_parser(*texts, **kwargs) - - # Encode for forward - with self.device_placement(): - inputs = self.tokenizer.batch_encode_plus( - inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' - ) - - return self._forward(inputs) - @contextmanager def device_placement(self): """ @@ -244,6 +264,18 @@ class Pipeline(_ScikitCompat): yield + def __call__(self, *texts, **kwargs): + # Parse arguments + inputs = self._args_parser(*texts, **kwargs) + + # Encode for forward + with self.device_placement(): + inputs = self.tokenizer.batch_encode_plus( + inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + ) + + return self._forward(inputs) + def _forward(self, inputs): """ Internal framework specific forward dispatching. @@ -275,12 +307,6 @@ class TextClassificationPipeline(Pipeline): """ Text classification pipeline using ModelForTextClassification head. """ - def __init__(self, model, tokenizer: PreTrainedTokenizer, nb_classes: int = 2): - super().__init__(model, tokenizer) - - if nb_classes < 2: - raise Exception('Invalid parameter nb_classes. int >= 2 is required (got: {})'.format(nb_classes)) - self._nb_classes = nb_classes def __call__(self, *args, **kwargs): outputs = super().__call__(*args, **kwargs) @@ -398,56 +424,6 @@ class QuestionAnsweringPipeline(Pipeline): Question Answering pipeline using ModelForQuestionAnswering head. """ - class QuestionAnsweringArgumentHandler(ArgumentHandler): - - def __call__(self, *args, **kwargs): - # Position args, handling is sensibly the same as X and data, so forwarding to avoid duplicating - if args is not None and len(args) > 0: - if len(args) == 1: - kwargs['X'] = args[0] - else: - kwargs['X'] = list(args) - - # Generic compatibility with sklearn and Keras - # Batched data - if 'X' in kwargs or 'data' in kwargs: - data = kwargs['X'] if 'X' in kwargs else kwargs['data'] - - if not isinstance(data, list): - data = [data] - - for i, item in enumerate(data): - if isinstance(item, dict): - if any(k not in item for k in ['question', 'context']): - raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') - data[i] = QuestionAnsweringPipeline.create_sample(**item) - - elif isinstance(item, SquadExample): - continue - else: - raise ValueError( - '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' - .format('X' if 'X' in kwargs else 'data') - ) - inputs = data - - # Tabular input - elif 'question' in kwargs and 'context' in kwargs: - if isinstance(kwargs['question'], str): - kwargs['question'] = [kwargs['question']] - - if isinstance(kwargs['context'], str): - kwargs['context'] = [kwargs['context']] - - inputs = [QuestionAnsweringPipeline.create_sample(q, c) for q, c in zip(kwargs['question'], kwargs['context'])] - else: - raise ValueError('Unknown arguments {}'.format(kwargs)) - - if not isinstance(inputs, list): - inputs = [inputs] - - return inputs - @staticmethod def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: """ From db90e1211433ef99f952e60fbe5ea578391b86b1 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 23:46:33 +0100 Subject: [PATCH 224/302] configuration: use S3 location for XLM-RoBERTa model --- transformers/configuration_xlm_roberta.py | 3 ++- 1 file changed, 2 insertions(+), 1 
deletion(-) diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py index dd03572976..d7a26538c5 100644 --- a/transformers/configuration_xlm_roberta.py +++ b/transformers/configuration_xlm_roberta.py @@ -25,7 +25,8 @@ from .configuration_roberta import RobertaConfig logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { - 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json", + 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", + 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", } From 5c5f67a256b558b470f03cf36edc5ea35dec2fba Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 23:47:00 +0100 Subject: [PATCH 225/302] modeling: use S3 location for XLM-RoBERTa model --- transformers/modeling_xlm_roberta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index abace25d5b..8095c46a16 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -27,8 +27,8 @@ from .file_utils import add_start_docstrings logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { - 'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-pytorch_model.bin", - 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-pytorch_model.bin", + 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin", + 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin", } From fe9aab1055604e772be05a1cbbb36a207c177055 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Wed, 18 Dec 2019 23:47:48 +0100 Subject: [PATCH 226/302] tokenization: use S3 location for XLM-RoBERTa model --- transformers/tokenization_xlm_roberta.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index 93b1c397e4..453c4375c6 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -30,8 +30,8 @@ VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'} PRETRAINED_VOCAB_FILES_MAP = { 'vocab_file': { - 'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-sentencepiece.bpe.model", - 'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-sentencepiece.bpe.model", + 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", + 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", } } From ec5d6c6a70e0bcdf31e737206caa1e56e859b2d3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 00:12:10 +0100 Subject: [PATCH 227/302] Adressing issue with NER task omitting first and last word. --- transformers/pipelines.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 9e7051b70a..1d8f226b13 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -318,8 +318,6 @@ class NerPipeline(Pipeline): """ Named Entity Recognition pipeline using ModelForTokenClassification head. 
""" - def __init__(self, model, tokenizer: PreTrainedTokenizer): - super().__init__(model, tokenizer) def __call__(self, *texts, **kwargs): inputs, answers = self._args_parser(*texts, **kwargs), [] @@ -344,14 +342,16 @@ class NerPipeline(Pipeline): # Normalize scores answer, token_start = [], 1 - for idx, word in groupby(token_to_word[1:-1]): + for idx, word in groupby(token_to_word): # Sum log prob over token, then normalize across labels score = np.exp(entities[token_start]) / np.exp(entities[token_start]).sum(-1, keepdims=True) label_idx = score.argmax() answer += [{ - 'word': words[idx - 1], 'score': score[label_idx].item(), 'entity': self.model.config.id2label[label_idx] + 'word': words[idx], + 'score': score[label_idx].item(), + 'entity': self.model.config.id2label[label_idx] }] # Update token start From a26ce4dee116a1d5d9099c8a94e22d1e31ad631c Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 19 Dec 2019 02:23:01 +0100 Subject: [PATCH 228/302] examples: add XLM-RoBERTa to glue script --- examples/run_glue.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/examples/run_glue.py b/examples/run_glue.py index 1a51255c11..954a8fbf0c 100644 --- a/examples/run_glue.py +++ b/examples/run_glue.py @@ -52,6 +52,9 @@ from transformers import (WEIGHTS_NAME, BertConfig, AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer, + XLMRobertaConfig, + XLMRobertaForSequenceClassification, + XLMRobertaTokenizer, ) from transformers import AdamW, get_linear_schedule_with_warmup @@ -72,7 +75,8 @@ MODEL_CLASSES = { 'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer), 'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer), 'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer), - 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer) + 'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer), + 'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer), } @@ -304,9 +308,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False): else: logger.info("Creating features from dataset file at %s", args.data_dir) label_list = processor.get_labels() - if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']: + if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta']: # HACK(label indices are swapped in RoBERTa pretrained model) - label_list[1], label_list[2] = label_list[2], label_list[1] + label_list[1], label_list[2] = label_list[2], label_list[1] examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir) features = convert_examples_to_features(examples, tokenizer, From bcc99fd92efe03ff332e5b26a342a09e9d709cf7 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 10:32:21 +0100 Subject: [PATCH 229/302] Fix wrong automatic config allocation through AutoConfig --- transformers/pipelines.py | 117 +++++++++++++++++++++++++------------- 1 file changed, 78 insertions(+), 39 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 1d8f226b13..ee3aed2c65 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -25,7 +25,7 @@ from typing import Union, Optional, Tuple, List, Dict import numpy as np -from transformers import AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \ +from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \ 
SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger if is_tf_available(): @@ -264,6 +264,27 @@ class Pipeline(_ScikitCompat): yield + def inputs_for_model(self, features: Union[dict, List[dict]]) -> Dict: + """ + Generates the input dictionary with model-specific parameters. + + Returns: + dict holding all the required parameters for model's forward + """ + args = ['input_ids', 'attention_mask'] + model_type = type(self.model).__name__.lower() + + if 'distilbert' not in model_type and 'xlm' not in model_type: + args += ['token_type_ids'] + + if 'xlnet' in model_type or 'xlm' in model_type: + args += ['cls_index', 'p_mask'] + + if isinstance(features, dict): + return {k: features[k] for k in args} + else: + return {k: [feature[k] for feature in features] for k in args} + def __call__(self, *texts, **kwargs): # Parse arguments inputs = self._args_parser(*texts, **kwargs) @@ -271,9 +292,14 @@ class Pipeline(_ScikitCompat): # Encode for forward with self.device_placement(): inputs = self.tokenizer.batch_encode_plus( - inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt' + inputs, add_special_tokens=True, + return_tensors='tf' if is_tf_available() else 'pt', + # max_length=self.model.config.max_position_embedding + max_length=511 ) + # Filter out features not available on specific models + inputs = self.inputs_for_model(inputs) return self._forward(inputs) def _forward(self, inputs): @@ -331,7 +357,11 @@ class NerPipeline(Pipeline): # Manage correct placement of the tensors with self.device_placement(): - tokens = self.tokenizer.encode_plus(sentence, return_attention_mask=False, return_tensors='tf' if is_tf_available() else 'pt') + tokens = self.tokenizer.encode_plus( + sentence, return_attention_mask=False, + return_tensors='tf' if is_tf_available() else 'pt', + max_length=512 + ) # Forward if is_torch_available(): @@ -443,27 +473,6 @@ class QuestionAnsweringPipeline(Pipeline): super().__init__(model, tokenizer, args_parser=QuestionAnsweringArgumentHandler(), device=device, **kwargs) - def inputs_for_model(self, features: Union[SquadExample, List[SquadExample]]) -> Dict: - """ - Generates the input dictionary with model-specific parameters. 
- - Returns: - dict holding all the required parameters for model's forward - """ - args = ['input_ids', 'attention_mask'] - model_type = type(self.model).__name__.lower() - - if 'distilbert' not in model_type and 'xlm' not in model_type: - args += ['token_type_ids'] - - if 'xlnet' in model_type or 'xlm' in model_type: - args += ['cls_index', 'p_mask'] - - if isinstance(features, SquadExample): - return {k: features.__dict__[k] for k in args} - else: - return {k: [feature.__dict__[k] for feature in features] for k in args} - def __call__(self, *texts, **kwargs): """ Args: @@ -495,7 +504,7 @@ class QuestionAnsweringPipeline(Pipeline): # Convert inputs to features examples = self._args_parser(*texts, **kwargs) features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) - fw_args = self.inputs_for_model(features) + fw_args = self.inputs_for_model(features.__dict__) # Manage tensor allocation on correct device with self.device_placement(): @@ -627,29 +636,50 @@ class QuestionAnsweringPipeline(Pipeline): # Register all the supported task here SUPPORTED_TASKS = { 'feature-extraction': { - 'impl': FeatureExtractionPipeline, - 'tf': TFAutoModel if is_tf_available() else None, - 'pt': AutoModel if is_torch_available() else None, + 'impl': FeatureExtractionPipeline, + 'tf': TFAutoModel if is_tf_available() else None, + 'pt': AutoModel if is_torch_available() else None, + 'default': { + 'model': 'distilbert-base-uncased', + 'config': None, + 'tokenizer': 'bert-base-uncased' + } }, - 'text-classification': { + 'sentiment-analysis': { 'impl': TextClassificationPipeline, 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, - 'pt': AutoModelForSequenceClassification if is_torch_available() else None + 'pt': AutoModelForSequenceClassification if is_torch_available() else None, + 'default': { + 'model': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', + 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json', + 'tokenizer': 'bert-base-uncased' + } }, 'ner': { - 'impl': NerPipeline, - 'tf': TFAutoModelForTokenClassification if is_tf_available() else None, - 'pt': AutoModelForTokenClassification if is_torch_available() else None, + 'impl': NerPipeline, + 'tf': TFAutoModelForTokenClassification if is_tf_available() else None, + 'pt': AutoModelForTokenClassification if is_torch_available() else None, + 'default': { + 'model': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', + 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json', + 'tokenizer': 'bert-base-cased' + } }, 'question-answering': { 'impl': QuestionAnsweringPipeline, 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, - 'pt': AutoModelForQuestionAnswering if is_torch_available() else None + 'pt': AutoModelForQuestionAnswering if is_torch_available() else None, + 'default': { + 'model': 'distilbert-base-uncased-distilled-squad', + 'config': None, + 'tokenizer': 'bert-base-uncased' + } } } -def pipeline(task: str, model, config: Optional[Union[str, PretrainedConfig]] = None, +def pipeline(task: str, model: Optional = None, + config: Optional[Union[str, PretrainedConfig]] = None, tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> 
Pipeline: """ Utility factory method to build a pipeline. @@ -657,23 +687,32 @@ def pipeline(task: str, model, config: Optional[Union[str, PretrainedConfig]] = A Tokenizer instance in charge of mapping raw textual input to token A Model instance Some (optional) post processing for enhancing model's output + + Examples: + pipeline('ner') """ # Try to infer tokenizer from model name (if provided as str) if tokenizer is None: - if not isinstance(model, str): + if model is not None and not isinstance(model, str): # Impossible to guest what is the right tokenizer here raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') else: tokenizer = model - tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer) - + # Retrieve the task if task not in SUPPORTED_TASKS: raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) targeted_task = SUPPORTED_TASKS[task] task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] + # Handling for default model for the task + if model is None: + model, config, tokenizer = tuple(targeted_task['default'].values()) + + # Allocate tokenizer + tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer) + # Special handling for model conversion if isinstance(model, str): from_tf = model.endswith('.h5') and not is_tf_available() @@ -689,7 +728,7 @@ def pipeline(task: str, model, config: Optional[Union[str, PretrainedConfig]] = from_tf = from_pt = False if isinstance(config, str): - config = PretrainedConfig.from_pretrained(config) + config = AutoConfig.from_pretrained(config) if allocator.__name__.startswith('TF'): model = allocator.from_pretrained(model, config=config, from_pt=from_pt) From d72fa2a0f67f9c55877826f767f01007507fd8e3 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 10:54:10 +0100 Subject: [PATCH 230/302] Fix inputs_for_model call in QuestionAnsweringPipeline accessing __dict__ on list. 
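
squad_convert_examples_to_features returns a list of SquadFeatures, so the call
introduced in the previous commit, self.inputs_for_model(features.__dict__),
raised an AttributeError (a plain list has no __dict__). The per-feature
dictionaries are now built explicitly before being handed to inputs_for_model:

    # fixed call site in QuestionAnsweringPipeline.__call__
    fw_args = self.inputs_for_model([f.__dict__ for f in features])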
--- transformers/pipelines.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index ee3aed2c65..a220aa7e71 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -504,7 +504,7 @@ class QuestionAnsweringPipeline(Pipeline): # Convert inputs to features examples = self._args_parser(*texts, **kwargs) features = squad_convert_examples_to_features(examples, self.tokenizer, kwargs['max_seq_len'], kwargs['doc_stride'], kwargs['max_question_len'], False) - fw_args = self.inputs_for_model(features.__dict__) + fw_args = self.inputs_for_model([f.__dict__ for f in features]) # Manage tensor allocation on correct device with self.device_placement(): From f516cf39564bd33bd26e6a51d6cf9f589e600078 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 11:42:33 +0100 Subject: [PATCH 231/302] Allow pipeline to write output in binary format --- transformers/commands/run.py | 10 +++++++++- transformers/pipelines.py | 32 ++++++++++++++++++++++++++++++-- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index 7c00c0057f..78109b2a16 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -1,9 +1,13 @@ +import logging from argparse import ArgumentParser from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import pipeline, Pipeline, PipelineDataFormat, SUPPORTED_TASKS +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + + def try_infer_format_from_ext(path: str): for ext in PipelineDataFormat.SUPPORTED_FORMATS: if path.endswith(ext): @@ -51,7 +55,11 @@ class RunCommand(BaseTransformersCLICommand): output += [nlp(entry)] # Saving data - self._reader.save(output) + if self._nlp.binary_output: + binary_path = self._reader.save_binary(output) + logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path)) + else: + self._reader.save(output) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index a220aa7e71..b766549121 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -17,6 +17,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera import csv import json import os +import pickle from abc import ABC, abstractmethod from contextlib import contextmanager from itertools import groupby @@ -91,6 +92,7 @@ class PipelineDataFormat: if exists(abspath(self.output)): raise OSError('{} already exists on disk'.format(self.output)) + if path is not None: if not exists(abspath(self.path)): raise OSError('{} doesnt exist on disk'.format(self.path)) @@ -102,6 +104,15 @@ class PipelineDataFormat: def save(self, data: dict): raise NotImplementedError() + def save_binary(self, data: Union[dict, List[dict]]) -> str: + path, _ = os.path.splitext(self.output) + binary_path = os.path.extsep.join((path, 'pickle')) + + with open(binary_path, 'wb+') as f_output: + pickle.dump(data, f_output) + + return binary_path + @staticmethod def from_str(name: str, output: Optional[str], path: Optional[str], column: Optional[str]): if name == 'json': @@ -177,12 +188,20 @@ class PipedPipelineDataFormat(PipelineDataFormat): # No dictionary to map arguments else: - print(line) yield line def save(self, data: dict): print(data) + def save_binary(self, data: Union[dict, List[dict]]) -> str: + if self.output is None: + raise KeyError( + 'When using piped input on pipeline outputting large object 
requires an output file path. ' + 'Please provide such output path through --output argument.' + ) + + return super().save_binary(data) + class _ScikitCompat(ABC): """ @@ -205,11 +224,13 @@ class Pipeline(_ScikitCompat): Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output """ def __init__(self, model, tokenizer: PreTrainedTokenizer = None, - args_parser: ArgumentHandler = None, device: int = -1, **kwargs): + args_parser: ArgumentHandler = None, device: int = -1, + binary_output: bool = False): self.model = model self.tokenizer = tokenizer self.device = device + self.binary_output = binary_output self._args_parser = args_parser or DefaultArgumentHandler() # Special handling @@ -325,6 +346,13 @@ class FeatureExtractionPipeline(Pipeline): """ Feature extraction pipeline using Model head. """ + + def __init__(self, model, + tokenizer: PreTrainedTokenizer = None, + args_parser: ArgumentHandler = None, + device: int = -1): + super().__init__(model, tokenizer, args_parser, device, binary_output=True) + def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() From fc624716aad05efa78e4d65f214fb978ea0ac9e7 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 11:49:06 +0100 Subject: [PATCH 232/302] Renaming framework env variables flags from NO_ to USE_ --- transformers/file_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 4784681fb4..c586c57a76 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -27,8 +27,9 @@ from contextlib import contextmanager logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: - if 'NO_TF' in os.environ and os.environ['NO_TF'].upper() in ('1', 'ON'): - logger.info("Found NO_TF, disabling TensorFlow") + os.environ.setdefault('USE_TF', 'YES') + if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'): + logger.info("USE_TF override through env variable, disabling Tensorflow") _tf_available = False else: import tensorflow as tf @@ -39,8 +40,9 @@ except (ImportError, AssertionError): _tf_available = False # pylint: disable=invalid-name try: - if 'NO_TORCH' in os.environ and os.environ['NO_TORCH'].upper() in ('1', 'ON'): - logger.info("Found NO_TORCH, disabling PyTorch") + os.environ.setdefault('USE_TORCH', 'YES') + if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'): + logger.info("USE_TORCH override through env variable, disabling PyTorch") _torch_available = False else: import torch From 3b29322d4c197fec46ae48f62aa2870d00d0852a Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 12:24:17 +0100 Subject: [PATCH 233/302] Expose all the pipeline argument on serve command. --- transformers/commands/serving.py | 38 +++++++++++++------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index a35dff0ebe..a7321470ce 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -17,25 +17,18 @@ def serve_command_factory(args: Namespace): Factory function used to instantiate serving server from provided command line arguments. 
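The binary-output plumbing introduced earlier in this patch (binary_output on Pipeline, save_binary on PipelineDataFormat) can be exercised end to end; a minimal sketch, assuming the from_str helper shown above and purely hypothetical file names, of producing and reloading the pickle that the run command writes for such pipelines:

    import pickle
    from transformers.pipelines import pipeline, PipelineDataFormat

    # 'samples.csv' / 'features.out' are made-up paths for illustration only.
    reader = PipelineDataFormat.from_str('csv', output='features.out', path='samples.csv', column='text')
    nlp = pipeline('feature-extraction', model='distilbert-base-uncased', tokenizer='distilbert-base-uncased')

    outputs = [nlp(entry) for entry in reader]
    binary_path = reader.save_binary(outputs)   # derives 'features.pickle' from the requested output path
    with open(binary_path, 'rb') as f:
        restored = pickle.load(f)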
:return: ServeCommand """ - nlp = pipeline(args.task, args.model) - return ServeCommand(nlp, args.host, args.port, args.model, args.graphql) + nlp = pipeline(task=args.task, model=args.model, config=args.config, tokenizer=args.tokenizer, device=args.device) + return ServeCommand(nlp, args.host, args.port) -class ServeResult(BaseModel): - """ - Base class for serving result - """ - model: str - - -class ServeModelInfoResult(ServeResult): +class ServeModelInfoResult(BaseModel): """ Expose model information """ infos: dict -class ServeTokenizeResult(ServeResult): +class ServeTokenizeResult(BaseModel): """ Tokenize result model """ @@ -43,14 +36,14 @@ class ServeTokenizeResult(ServeResult): tokens_ids: Optional[List[int]] -class ServeDeTokenizeResult(ServeResult): +class ServeDeTokenizeResult(BaseModel): """ DeTokenize result model """ text: str -class ServeForwardResult(ServeResult): +class ServeForwardResult(BaseModel): """ Forward result model """ @@ -71,11 +64,12 @@ class ServeCommand(BaseTransformersCLICommand): serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.') serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.') - serve_parser.add_argument('--model', type=str, required=True, help='Model\'s name or path to stored model to infer from.') - serve_parser.add_argument('--graphql', action='store_true', default=False, help='Enable GraphQL endpoints.') + serve_parser.add_argument('--model', type=str, required=True, help='Model\'s name or path to stored model.') + serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.') + serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.') serve_parser.set_defaults(func=serve_command_factory) - def __init__(self, pipeline: Pipeline, host: str, port: int, model: str, graphql: bool): + def __init__(self, pipeline: Pipeline, host: str, port: int): self._logger = getLogger('transformers-cli/serving') self._pipeline = pipeline @@ -95,7 +89,7 @@ class ServeCommand(BaseTransformersCLICommand): run(self._app, host=self._host, port=self._port) def model_info(self): - return ServeModelInfoResult(model='', infos=vars(self._pipeline.model.config)) + return ServeModelInfoResult(infos=vars(self._pipeline.model.config)) def tokenize(self, text_input: str = Body(None, embed=True), return_ids: bool = Body(False, embed=True)): """ @@ -108,9 +102,9 @@ class ServeCommand(BaseTransformersCLICommand): if return_ids: tokens_ids = self._pipeline.tokenizer.convert_tokens_to_ids(tokens_txt) - return ServeTokenizeResult(model='', tokens=tokens_txt, tokens_ids=tokens_ids) + return ServeTokenizeResult(tokens=tokens_txt, tokens_ids=tokens_ids) else: - return ServeTokenizeResult(model='', tokens=tokens_txt) + return ServeTokenizeResult(tokens=tokens_txt) except Exception as e: raise HTTPException(status_code=500, detail={"model": '', "error": str(e)}) @@ -139,13 +133,11 @@ class ServeCommand(BaseTransformersCLICommand): # Check we don't have empty string if len(inputs) == 0: - return ServeForwardResult(model='', output=[], attention=[]) + return ServeForwardResult(output=[], attention=[]) try: # Forward through the model output = self._pipeline(inputs) - return ServeForwardResult( - model='', output=output - ) + return ServeForwardResult(output=output) except 
Exception as e: raise HTTPException(500, {"error": str(e)}) From 5664327c24730b30b52d84e71e9af7c1d32b0fe1 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 12:27:54 +0100 Subject: [PATCH 234/302] Hide train command for now. --- transformers-cli | 2 -- 1 file changed, 2 deletions(-) diff --git a/transformers-cli b/transformers-cli index 39b7f5816b..db2bd0e2a3 100755 --- a/transformers-cli +++ b/transformers-cli @@ -5,7 +5,6 @@ from transformers.commands.download import DownloadCommand from transformers.commands.run import RunCommand from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands -from transformers.commands.train import TrainCommand from transformers.commands.convert import ConvertCommand if __name__ == '__main__': @@ -17,7 +16,6 @@ if __name__ == '__main__': DownloadCommand.register_subcommand(commands_parser) RunCommand.register_subcommand(commands_parser) ServeCommand.register_subcommand(commands_parser) - TrainCommand.register_subcommand(commands_parser) UserCommands.register_subcommand(commands_parser) # Let's go From faef6f6191a8d319b541396a0f850c3d6f15f5d4 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 12:28:17 +0100 Subject: [PATCH 235/302] Fix logic order for USE_TF/USE_TORCH --- transformers/file_utils.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index c586c57a76..92cada85b3 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -29,25 +29,27 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: os.environ.setdefault('USE_TF', 'YES') if os.environ['USE_TF'].upper() in ('1', 'ON', 'YES'): - logger.info("USE_TF override through env variable, disabling Tensorflow") - _tf_available = False - else: import tensorflow as tf assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2 _tf_available = True # pylint: disable=invalid-name logger.info("TensorFlow version {} available.".format(tf.__version__)) + else: + logger.info("USE_TF override through env variable, disabling Tensorflow") + _tf_available = False + except (ImportError, AssertionError): _tf_available = False # pylint: disable=invalid-name try: os.environ.setdefault('USE_TORCH', 'YES') if os.environ['USE_TORCH'].upper() in ('1', 'ON', 'YES'): - logger.info("USE_TORCH override through env variable, disabling PyTorch") - _torch_available = False - else: import torch _torch_available = True # pylint: disable=invalid-name logger.info("PyTorch version {} available.".format(torch.__version__)) + + else: + logger.info("USE_TORCH override through env variable, disabling PyTorch") + _torch_available = False except ImportError: _torch_available = False # pylint: disable=invalid-name From 81a911cce5d659ef66eddff288489f25fc195f16 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 15:12:06 +0100 Subject: [PATCH 236/302] Doc, doc, ... doc. --- transformers/pipelines.py | 62 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 4 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index b766549121..71e6d0fbed 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -102,9 +102,19 @@ class PipelineDataFormat: @abstractmethod def save(self, data: dict): + """ + Save the provided data object with the representation for the current `DataFormat`. 
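With the framework flags renamed to USE_TF / USE_TORCH earlier in this series and their logic order fixed just above, framework selection is controlled entirely from the environment; a small sketch of the intended behaviour, assuming transformers is imported only after the variables are set:

    import os

    # Any value outside ('1', 'ON', 'YES') disables the corresponding framework; 'YES' is the default.
    os.environ['USE_TF'] = 'NO'
    os.environ['USE_TORCH'] = 'YES'

    import transformers
    print(transformers.is_tf_available())     # expected: False
    print(transformers.is_torch_available())  # expected: True, provided torch is installed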
+ :param data: data to store + :return: + """ raise NotImplementedError() def save_binary(self, data: Union[dict, List[dict]]) -> str: + """ + Save the provided data object as a pickle-formatted binary data on the disk. + :param data: data to store + :return: (str) Path where the data has been saved + """ path, _ = os.path.splitext(self.output) binary_path = os.path.extsep.join((path, 'pickle')) @@ -222,6 +232,42 @@ class Pipeline(_ScikitCompat): Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following operations: Input -> Tokenization -> Model Inference -> Post-Processing (Task dependent) -> Output + + Pipeline supports running on CPU or GPU through the device argument. Users can specify + device argument as an integer, -1 meaning "CPU", >= 0 referring the CUDA device ordinal. + + Some pipeline, like for instance FeatureExtractionPipeline ('feature-extraction') outputs large + tensor object as nested-lists. In order to avoid dumping such large structure as textual data we + provide the binary_output constructor argument. If set to True, the output will be stored in the + pickle format. + + Arguments: + **model**: ``(str, PretrainedModel, TFPretrainedModel)``: + Reference to the model to use through this pipeline. + + **tokenizer**: ``(str, PreTrainedTokenizer)``: + Reference to the tokenizer to use through this pipeline. + + **args_parser**: ``ArgumentHandler``: + Reference to the object in charge of parsing supplied pipeline parameters. + + **device**: ``int``: + Device ordinal for CPU/GPU supports. Setting this to -1 will leverage CPU, >=0 will run the model + on the associated CUDA device id. + + **binary_output** ``bool`` (default: False): + Flag indicating if the output the pipeline should happen in a binary format (i.e. pickle) or as raw text. + + Return: + Pipeline returns list or dictionary depending on: + - Does the user provided multiple sample + - The pipeline expose multiple fields in the output object + + Examples: + nlp = pipeline('ner') + nlp = pipeline('ner', model='...', config='...', tokenizer='...') + nlp = NerPipeline(model='...', config='...', tokenizer='...') + nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...') """ def __init__(self, model, tokenizer: PreTrainedTokenizer = None, args_parser: ArgumentHandler = None, device: int = -1, @@ -312,11 +358,11 @@ class Pipeline(_ScikitCompat): # Encode for forward with self.device_placement(): + # TODO : Remove this 512 hard-limit inputs = self.tokenizer.batch_encode_plus( inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt', - # max_length=self.model.config.max_position_embedding - max_length=511 + max_length=512 ) # Filter out features not available on specific models @@ -385,6 +431,8 @@ class NerPipeline(Pipeline): # Manage correct placement of the tensors with self.device_placement(): + + # TODO : Remove this 512 hard-limit tokens = self.tokenizer.encode_plus( sentence, return_attention_mask=False, return_tensors='tf' if is_tf_available() else 'pt', @@ -488,9 +536,12 @@ class QuestionAnsweringPipeline(Pipeline): QuestionAnsweringPipeline leverages the SquadExample/SquadFeatures internally. This helper method encapsulate all the logic for converting question(s) and context(s) to SquadExample(s). We currently support extractive question answering. 
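The constructor arguments documented in the new Pipeline docstring map directly onto the factory; a short usage sketch built from the identifiers that docstring already lists, following the -1 = CPU, >= 0 = CUDA ordinal convention described there:

    from transformers import pipeline

    # Task-only form, letting the factory pick its default model and tokenizer.
    nlp = pipeline('sentiment-analysis')

    # Explicit form, pinning model and tokenizer and running on the first CUDA device.
    qa = pipeline('question-answering',
                  model='distilbert-base-uncased-distilled-squad',
                  tokenizer='bert-base-cased',
                  device=0)   # the default of -1 stays on CPU

    print(nlp('HuggingFace is solving NLP one commit at a time.'))
    print(qa(question='Where was HuggingFace founded ?', context='HuggingFace was founded in Paris.'))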
- Args: + Arguments: question: (str, List[str]) The question to be ask for the associated context context: (str, List[str]) The context in which we will look for the answer. + + Returns: + SquadExample initialized with the corresponding question and context. """ if isinstance(question, list): return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] @@ -717,7 +768,10 @@ def pipeline(task: str, model: Optional = None, Some (optional) post processing for enhancing model's output Examples: - pipeline('ner') + pipeline('sentiment-analysis') + pipeline('question-answering', model='distilbert-base-uncased-distilled-squad', tokenizer='bert-base-cased') + pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...) + pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased') """ # Try to infer tokenizer from model name (if provided as str) if tokenizer is None: From ed6ba93912d223886fe0b88dd4ee58b20774beaf Mon Sep 17 00:00:00 2001 From: patrickvonplaten Date: Thu, 19 Dec 2019 01:26:01 +0100 Subject: [PATCH 237/302] corrected typo in example for t5 model input argument --- transformers/modeling_t5.py | 4 ++-- transformers/modeling_tf_t5.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py index 263dc33b70..9baf69d02b 100644 --- a/transformers/modeling_t5.py +++ b/transformers/modeling_t5.py @@ -693,7 +693,7 @@ class T5Model(T5PreTrainedModel): tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5Model.from_pretrained('t5-small') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - outputs = model(input_ids) + outputs = model(input_ids=input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ @@ -798,7 +798,7 @@ class T5WithLMHeadModel(T5PreTrainedModel): tokenizer = T5Tokenizer.from_pretrained('t5-small') model = T5WithLMHeadModel.from_pretrained('t5-small') input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0) # Batch size 1 - outputs = model(input_ids, lm_labels=input_ids) + outputs = model(input_ids=input_ids, lm_labels=input_ids) loss, prediction_scores = outputs[:2] """ diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py index 1336a1c30d..e803e00c8d 100644 --- a/transformers/modeling_tf_t5.py +++ b/transformers/modeling_tf_t5.py @@ -610,7 +610,7 @@ class TFT5Model(TFT5PreTrainedModel): tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5Model.from_pretrained('t5-small') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) + outputs = model(input_ids=input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple """ @@ -701,7 +701,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel): tokenizer = T5Tokenizer.from_pretrained('t5-small') model = TFT5WithLMHeadModel.from_pretrained('t5-small') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 - outputs = model(input_ids) + outputs = model(input_ids=input_ids) prediction_scores = outputs[0] """ From 284572efc05a6a8d9e351e886ea3cab0f5f2367a Mon Sep 17 00:00:00 2001 From: Ejar Date: Wed, 18 Dec 2019 17:47:47 +0100 Subject: [PATCH 238/302] Updated typo on the link Updated documentation due to typo --- examples/README.md | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/README.md b/examples/README.md index b6b3908810..fcd2fe1f6f 100644 --- a/examples/README.md +++ b/examples/README.md @@ -467,7 +467,7 @@ Training with the previously defined hyper-parameters yields the following resul ## Named Entity Recognition Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and -[`run_tf_ner.py`(https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py)] for Tensorflow 2. +[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2. This example fine-tune Bert Multilingual on GermEval 2014 (German NER). Details and results for the fine-tuning provided by @stefan-it. From 62c1fc3c1ecdfab787ee3c34d1ec1eba65c18877 Mon Sep 17 00:00:00 2001 From: Francesco Date: Thu, 19 Dec 2019 14:43:10 +0100 Subject: [PATCH 239/302] Removed duplicate XLMConfig, XLMForQuestionAnswering and XLMTokenizer from import statement of run_squad.py script --- examples/run_squad.py | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 34c31c3bb8..1ff6983f62 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -61,7 +61,6 @@ MODEL_CLASSES = { 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer), 'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer), 'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer), - 'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer) } def set_seed(args): From a1f1dce0ae511ef7766c6b6a8f5ebf9118279e73 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 19 Dec 2019 12:25:55 -0500 Subject: [PATCH 240/302] Correct max position for SQUAD and TFDS --- transformers/data/processors/squad.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py index 84aa429e26..8e72bbbd6d 100644 --- a/transformers/data/processors/squad.py +++ b/transformers/data/processors/squad.py @@ -571,7 +571,9 @@ class SquadExample(object): # Start end end positions only has a value during evaluation. 
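The one-line change just below guards against answer spans whose computed end character falls past the last context character (the commit message points at SQuAD data served through TFDS); a toy illustration of the min() bound, using made-up values rather than real SQuAD annotations:

    char_to_word_offset = [0, 0, 0, 1, 1]    # 5 context characters mapping onto 2 words
    start_position_character, answer_text = 3, 'xyz'

    end_char = start_position_character + len(answer_text) - 1    # 5, one past the last valid index
    end_position = char_to_word_offset[min(end_char, len(char_to_word_offset) - 1)]
    assert end_position == 1                 # clamped to the final word instead of raising IndexError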
if start_position_character is not None and not is_impossible: self.start_position = char_to_word_offset[start_position_character] - self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1] + self.end_position = char_to_word_offset[ + min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1) + ] class SquadFeatures(object): From 33adab2b91697b3e78af618a21ab9f1176281165 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Thu, 19 Dec 2019 12:40:43 -0500 Subject: [PATCH 241/302] Fix albert example --- transformers/modeling_tf_albert.py | 4 ++-- transformers/modeling_utils.py | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py index d1650d41a8..ac55a73fa3 100644 --- a/transformers/modeling_tf_albert.py +++ b/transformers/modeling_tf_albert.py @@ -587,8 +587,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel): import tensorflow as tf from transformers import AlbertTokenizer, TFAlbertModel - tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased') - model = TFAlbertModel.from_pretrained('bert-base-uncased') + tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1') + model = TFAlbertModel.from_pretrained('albert-base-v1') input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :] # Batch size 1 outputs = model(input_ids) last_hidden_states = outputs[0] # The last hidden-state is the first element of the output tuple diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 9bd99b25dc..eff54f71e1 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -327,11 +327,6 @@ class PreTrainedModel(nn.Module): model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config) """ - if pretrained_model_name_or_path is not None and ( - "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path): - logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " + - "https://github.com/google-research/google-research/issues/119 for more information.") - config = kwargs.pop('config', None) state_dict = kwargs.pop('state_dict', None) cache_dir = kwargs.pop('cache_dir', None) From 3492a6ec17e207a2830e061528eae9c53639c234 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 19:06:44 +0100 Subject: [PATCH 242/302] Addressing Thom's comments. 
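The pipelines.py changes below keep routing execution through the device_placement context manager; a condensed sketch of the dispatch it performs, assuming the -1 = CPU, >= 0 = CUDA ordinal convention used throughout these patches:

    from contextlib import contextmanager

    @contextmanager
    def device_placement(device: int, tf_backend: bool):
        # Condensed from Pipeline.device_placement; the real method reads self.device
        # and picks the backend from is_tf_available().
        if tf_backend:
            import tensorflow as tf
            with tf.device('/CPU:0' if device == -1 else '/device:GPU:{}'.format(device)):
                yield
        else:
            import torch
            if device >= 0:
                torch.cuda.set_device(device)
            yield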
--- transformers/pipelines.py | 70 ++++++++++++++++++--------------------- 1 file changed, 33 insertions(+), 37 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 71e6d0fbed..e4bf9e0894 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -30,6 +30,7 @@ from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, Pretrai SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger if is_tf_available(): + import tensorflow as tf from transformers import TFAutoModel, TFAutoModelForSequenceClassification, \ TFAutoModelForQuestionAnswering, TFAutoModelForTokenClassification @@ -79,9 +80,9 @@ class PipelineDataFormat: """ SUPPORTED_FORMATS = ['json', 'csv', 'pipe'] - def __init__(self, output: Optional[str], path: Optional[str], column: Optional[str]): + def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]): self.output = output - self.path = path + self.path = input self.column = column.split(',') if column else [''] self.is_multi_columns = len(self.column) > 1 @@ -92,7 +93,7 @@ class PipelineDataFormat: if exists(abspath(self.output)): raise OSError('{} already exists on disk'.format(self.output)) - if path is not None: + if input is not None: if not exists(abspath(self.path)): raise OSError('{} doesnt exist on disk'.format(self.path)) @@ -136,8 +137,8 @@ class PipelineDataFormat: class CsvPipelineDataFormat(PipelineDataFormat): - def __init__(self, output: Optional[str], path: Optional[str], column: Optional[str]): - super().__init__(output, path, column) + def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]): + super().__init__(output, input, column) def __iter__(self): with open(self.path, 'r') as f: @@ -157,10 +158,10 @@ class CsvPipelineDataFormat(PipelineDataFormat): class JsonPipelineDataFormat(PipelineDataFormat): - def __init__(self, output: Optional[str], path: Optional[str], column: Optional[str]): - super().__init__(output, path, column) + def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]): + super().__init__(output, input, column) - with open(path, 'r') as f: + with open(input, 'r') as f: self._entries = json.load(f) def __iter__(self): @@ -321,11 +322,9 @@ class Pipeline(_ScikitCompat): Context manager """ if is_tf_available(): - import tensorflow as tf with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)): yield else: - import torch if self.device >= 0: torch.cuda.set_device(self.device) @@ -358,11 +357,10 @@ class Pipeline(_ScikitCompat): # Encode for forward with self.device_placement(): - # TODO : Remove this 512 hard-limit inputs = self.tokenizer.batch_encode_plus( inputs, add_special_tokens=True, return_tensors='tf' if is_tf_available() else 'pt', - max_length=512 + max_length=self.tokenizer.max_len ) # Filter out features not available on specific models @@ -379,11 +377,10 @@ class Pipeline(_ScikitCompat): """ if is_tf_available(): # TODO trace model - predictions = self.model(inputs)[0] + predictions = self.model(inputs, training=False)[0] else: - import torch with torch.no_grad(): - predictions = self.model(**inputs)[0] + predictions = self.model(**inputs).cpu()[0] return predictions.numpy() @@ -432,19 +429,18 @@ class NerPipeline(Pipeline): # Manage correct placement of the tensors with self.device_placement(): - # TODO : Remove this 512 hard-limit tokens = self.tokenizer.encode_plus( sentence, return_attention_mask=False, 
return_tensors='tf' if is_tf_available() else 'pt', - max_length=512 + max_length=self.tokenizer.max_len ) # Forward - if is_torch_available(): + if is_tf_available(): + entities = self.model(**tokens)[0][0].numpy() + else: with torch.no_grad(): entities = self.model(**tokens)[0][0].cpu().numpy() - else: - entities = self.model(tokens)[0][0].numpy() # Normalize scores answer, token_start = [], 1 @@ -484,28 +480,29 @@ class QuestionAnsweringArgumentHandler(ArgumentHandler): else: kwargs['X'] = list(args) - # Generic compatibility with sklearn and Keras - # Batched data + # Generic compatibility with sklearn and Keras + # Batched data if 'X' in kwargs or 'data' in kwargs: - data = kwargs['X'] if 'X' in kwargs else kwargs['data'] + inputs = kwargs['X'] if 'X' in kwargs else kwargs['data'] - if not isinstance(data, list): - data = [data] + if isinstance(inputs, dict): + inputs = [inputs] + else: + # Copy to avoid overriding arguments + inputs = [i for i in inputs] - for i, item in enumerate(data): + for i, item in enumerate(inputs): if isinstance(item, dict): if any(k not in item for k in ['question', 'context']): raise KeyError('You need to provide a dictionary with keys {question:..., context:...}') - data[i] = QuestionAnsweringPipeline.create_sample(**item) - elif isinstance(item, SquadExample): - continue - else: + inputs[i] = QuestionAnsweringPipeline.create_sample(**item) + + elif not isinstance(item, SquadExample): raise ValueError( '{} argument needs to be of type (list[SquadExample | dict], SquadExample, dict)' .format('X' if 'X' in kwargs else 'data') ) - inputs = data # Tabular input elif 'question' in kwargs and 'context' in kwargs: @@ -588,12 +585,10 @@ class QuestionAnsweringPipeline(Pipeline): # Manage tensor allocation on correct device with self.device_placement(): if is_tf_available(): - import tensorflow as tf fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} start, end = self.model(fw_args) start, end = start.numpy(), end.numpy() else: - import torch with torch.no_grad(): # Retrieve the score for the context tokens only (removing question tokens) fw_args = {k: torch.tensor(v) for (k, v) in fw_args.items()} @@ -812,8 +807,9 @@ def pipeline(task: str, model: Optional = None, if isinstance(config, str): config = AutoConfig.from_pretrained(config) - if allocator.__name__.startswith('TF'): - model = allocator.from_pretrained(model, config=config, from_pt=from_pt) - else: - model = allocator.from_pretrained(model, config=config, from_tf=from_tf) + if isinstance(model, str): + if allocator.__name__.startswith('TF'): + model = allocator.from_pretrained(model, config=config, from_pt=from_pt) + else: + model = allocator.from_pretrained(model, config=config, from_tf=from_tf) return task(model, tokenizer, **kwargs) From a305067f2d6ca74865c6d686608a1428e476a32f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Thu, 19 Dec 2019 19:41:48 +0100 Subject: [PATCH 243/302] Removed __main__ --- setup.py | 5 ----- transformers/__main__.py | 36 ------------------------------------ 2 files changed, 41 deletions(-) delete mode 100644 transformers/__main__.py diff --git a/setup.py b/setup.py index b3b6e2e063..56c4d1733b 100644 --- a/setup.py +++ b/setup.py @@ -68,11 +68,6 @@ setup( scripts=[ 'transformers-cli' ], - entry_points={ - 'console_scripts': [ - "transformers=transformers.__main__:main", - ] - }, # python_requires='>=3.5.0', classifiers=[ 'Intended Audience :: Science/Research', diff --git a/transformers/__main__.py b/transformers/__main__.py deleted file mode 100644 index 
a6e9ae65e0..0000000000 --- a/transformers/__main__.py +++ /dev/null @@ -1,36 +0,0 @@ -# coding: utf8 - -def main(): - import sys - if len(sys.argv) < 2 or sys.argv[1] not in ["convert", "train", "predict", "serve"]: - print( - "First argument to `transformers` command line interface should be one of: \n" - ">> convert serve train predict") - if sys.argv[1] == "convert": - from transformers.commands import convert - convert(sys.argv) - elif sys.argv[1] == "train": - from transformers.commands import train - train(sys.argv) - elif sys.argv[1] == "serve": - pass - # from argparse import ArgumentParser - # from transformers.commands.serving import ServeCommand - # parser = ArgumentParser('Transformers CLI tool', usage='transformers serve []') - # commands_parser = parser.add_subparsers(help='transformers-cli command helpers') - - # # Register commands - # ServeCommand.register_subcommand(commands_parser) - - # # Let's go - # args = parser.parse_args() - - # if not hasattr(args, 'func'): - # parser.print_help() - # exit(1) - # # Run - # service = args.func(args) - # service.run() - -if __name__ == '__main__': - main() From 149dc376aa5070db04cdf6e3358e87cab6670251 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 19 Dec 2019 20:34:28 +0100 Subject: [PATCH 244/302] fix tests --- transformers/configuration_utils.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 1aede2d6eb..f692c9b132 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -57,8 +57,6 @@ class PretrainedConfig(object): self.use_bfloat16 = kwargs.pop('use_bfloat16', False) self.pruned_heads = kwargs.pop('pruned_heads', {}) self.is_decoder = kwargs.pop('is_decoder', False) - self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)}) - self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys()))) # Fine-tuning task arguments self.finetuning_task = kwargs.pop('finetuning_task', None) From e4baa68ddbcb28488d7cef44ea5483a955d2effb Mon Sep 17 00:00:00 2001 From: thomwolf Date: Thu, 19 Dec 2019 20:37:26 +0100 Subject: [PATCH 245/302] tick-tock cc @julien-c --- .circleci/config.yml | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 9a81eea902..7a64eaba7d 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -80,17 +80,6 @@ jobs: - run: sudo pip install pytest - run: sudo pip install mecab-python3 - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py - build_py2_custom_tokenizers: - working_directory: ~/transformers - docker: - - image: circleci/python:2.7 - steps: - - checkout - - run: sudo pip install --progress-bar off . 
- - run: sudo pip install pytest - - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig - - run: sudo pip install mecab-python - - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py deploy_doc: working_directory: ~/transformers docker: @@ -124,7 +113,6 @@ workflows: jobs: - repository_consistency - build_py3_custom_tokenizers - - build_py2_custom_tokenizers - build_py3_torch_and_tf - build_py3_torch - build_py3_tf From 3376adc05157ba826acafd49f07f9a01ae30eb07 Mon Sep 17 00:00:00 2001 From: Stefan Schweter Date: Thu, 19 Dec 2019 21:30:23 +0100 Subject: [PATCH 246/302] configuration/modeling/tokenization: add various fine-tuned XLM-RoBERTa models for English, German, Spanish and Dutch (CoNLL datasets) --- transformers/configuration_xlm_roberta.py | 4 ++++ transformers/modeling_xlm_roberta.py | 4 ++++ transformers/tokenization_xlm_roberta.py | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py index d7a26538c5..5b6955f4f8 100644 --- a/transformers/configuration_xlm_roberta.py +++ b/transformers/configuration_xlm_roberta.py @@ -27,6 +27,10 @@ logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = { 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json", 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json", + 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json", + 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json", + 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json", + 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json", } diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py index 8095c46a16..0bdce941a5 100644 --- a/transformers/modeling_xlm_roberta.py +++ b/transformers/modeling_xlm_roberta.py @@ -29,6 +29,10 @@ logger = logging.getLogger(__name__) XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = { 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin", 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin", + 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin", + 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin", + 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin", + 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin", } diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index 453c4375c6..4397e7b031 100644 --- a/transformers/tokenization_xlm_roberta.py +++ 
b/transformers/tokenization_xlm_roberta.py @@ -32,6 +32,10 @@ PRETRAINED_VOCAB_FILES_MAP = { { 'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model", 'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model", + 'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model", + 'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model", + 'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model", + 'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model", } } From 9a399ead253e27792cbf0ef386cc39f9b7084f8f Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Thu, 19 Dec 2019 15:45:48 -0500 Subject: [PATCH 247/302] Revert incorrect #1778 --- transformers/modeling_tf_pytorch_utils.py | 3 --- transformers/modeling_utils.py | 2 -- 2 files changed, 5 deletions(-) diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py index d885fd23b3..190caff18d 100644 --- a/transformers/modeling_tf_pytorch_utils.py +++ b/transformers/modeling_tf_pytorch_utils.py @@ -119,9 +119,6 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a new_key = key.replace('gamma', 'weight') if 'beta' in key: new_key = key.replace('beta', 'bias') - # DialoGPT format - if key == 'lm_head.decoder.weight': - new_key = 'lm_head.weight' if new_key: old_keys.append(key) new_keys.append(new_key) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index eff54f71e1..cce234838a 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -437,8 +437,6 @@ class PreTrainedModel(nn.Module): new_key = key.replace('gamma', 'weight') if 'beta' in key: new_key = key.replace('beta', 'bias') - if key == 'lm_head.decoder.weight': - new_key = 'lm_head.weight' if new_key: old_keys.append(key) new_keys.append(new_key) From 1718fb9e7444a0883a550ab90c61bb8bf8a97076 Mon Sep 17 00:00:00 2001 From: Aidan Kierans <31550769+aidankierans@users.noreply.github.com> Date: Thu, 19 Dec 2019 16:23:18 -0500 Subject: [PATCH 248/302] Minor/basic text fixes (#2229) * Small clarification Matches line 431 to line 435 for additional clarity and consistency. * Fixed minor typo The letter "s" was previously omitted from the word "docstrings". --- CONTRIBUTING.md | 2 +- examples/run_lm_finetuning.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8228dd59d8..7d7f2c73ff 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -168,7 +168,7 @@ Follow these steps to start contributing: to be merged; 4. Make sure pre-existing tests still pass; 5. Add high-coverage tests. No quality test, no merge; -6. All public methods must have informative doctrings; +6. 
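The four CoNLL-finetuned checkpoints registered in this patch can be pulled by their shortcut names; a small loading sketch, under the assumption that the XLM-RoBERTa tokenizer and token-classification classes resolve these identifiers (only the identifier strings themselves come from this diff):

    from transformers import XLMRobertaTokenizer, XLMRobertaForTokenClassification

    name = 'xlm-roberta-large-finetuned-conll03-english'
    tokenizer = XLMRobertaTokenizer.from_pretrained(name)
    # Class name assumed for illustration; this patch only adds the checkpoint URLs.
    model = XLMRobertaForTokenClassification.from_pretrained(name)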
All public methods must have informative docstrings; ### Style guide diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index c4c73e71af..d8127e24a5 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -428,9 +428,9 @@ def main(): parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam.") + help="The initial learning rate for Adam optimizer.") parser.add_argument("--weight_decay", default=0.0, type=float, - help="Weight deay if we apply some.") + help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.") parser.add_argument("--max_grad_norm", default=1.0, type=float, From a5a06a851e1da79138e53978aa079a093f243dde Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Thu, 19 Dec 2019 16:24:20 -0500 Subject: [PATCH 249/302] [doc] Param name consistency --- examples/run_lm_finetuning.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py index d8127e24a5..75848d5acc 100644 --- a/examples/run_lm_finetuning.py +++ b/examples/run_lm_finetuning.py @@ -428,7 +428,7 @@ def main(): parser.add_argument('--gradient_accumulation_steps', type=int, default=1, help="Number of updates steps to accumulate before performing a backward/update pass.") parser.add_argument("--learning_rate", default=5e-5, type=float, - help="The initial learning rate for Adam optimizer.") + help="The initial learning rate for Adam.") parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.") parser.add_argument("--adam_epsilon", default=1e-8, type=float, From f25e9b6f771ea1a10f4525bdb212f841efcdbd3a Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Thu, 19 Dec 2019 18:28:17 -0500 Subject: [PATCH 250/302] [hf_bucket_url] support for cloudfront urls --- transformers/file_utils.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 16010f7e0a..4a9de9b53c 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -77,6 +77,7 @@ DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert" +CLOUDFRONT_DISTRIB_PREFIX = "https://d2ws9o8vfrpkyk.cloudfront.net" def is_torch_available(): @@ -114,11 +115,12 @@ def is_remote_url(url_or_filename): parsed = urlparse(url_or_filename) return parsed.scheme in ('http', 'https', 's3') -def hf_bucket_url(identifier, postfix=None): +def hf_bucket_url(identifier, postfix=None, cdn=False): + endpoint = CLOUDFRONT_DISTRIB_PREFIX if cdn else S3_BUCKET_PREFIX if postfix is None: - return "/".join((S3_BUCKET_PREFIX, identifier)) + return "/".join((endpoint, identifier)) else: - return "/".join((S3_BUCKET_PREFIX, identifier, postfix)) + return "/".join((endpoint, identifier, postfix)) def url_to_filename(url, etag=None): @@ -126,7 +128,7 @@ def url_to_filename(url, etag=None): Convert `url` into a hashed filename in a repeatable way. If `etag` is specified, append its hash to the url's, delimited by a period. 
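hf_bucket_url above simply joins the selected endpoint with the identifier and optional postfix; a quick sketch of both outputs for the same weights file, derived from the two prefixes defined in this diff:

    from transformers.file_utils import hf_bucket_url

    hf_bucket_url('bert-base-uncased', postfix='pytorch_model.bin')
    # 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased/pytorch_model.bin'

    hf_bucket_url('bert-base-uncased', postfix='pytorch_model.bin', cdn=True)
    # 'https://d2ws9o8vfrpkyk.cloudfront.net/bert-base-uncased/pytorch_model.bin'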
- If the url ends with .h5 (Keras HDF5 weights) ands '.h5' to the name + If the url ends with .h5 (Keras HDF5 weights) adds '.h5' to the name so that TF 2.0 can identify it as a HDF5 file (see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1380) """ From 15d897ff4a29e851285844e82763155a3f9f86b0 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Thu, 19 Dec 2019 18:29:22 -0500 Subject: [PATCH 251/302] [http] customizable requests user-agent --- transformers/file_utils.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 4a9de9b53c..611202159d 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -23,6 +23,7 @@ from botocore.exceptions import ClientError import requests from tqdm.auto import tqdm from contextlib import contextmanager +from . import __version__ logger = logging.getLogger(__name__) # pylint: disable=invalid-name @@ -173,7 +174,7 @@ def filename_to_url(filename, cache_dir=None): return url, etag -def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False): +def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None, resume_download=False, user_agent=None): """ Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and @@ -183,6 +184,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N cache_dir: specify a cache directory to save the file to (overwrite the default cache dir). force_download: if True, re-dowload the file even if it's already cached in the cache dir. resume_download: if True, resume the download if incompletly recieved file is found. + user_agent: Optional string or dict that will be appended to the user-agent on remote requests. """ if cache_dir is None: cache_dir = TRANSFORMERS_CACHE @@ -195,7 +197,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N # URL, so get it from the cache (downloading if necessary) return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies, - resume_download=resume_download) + resume_download=resume_download, user_agent=user_agent) elif os.path.exists(url_or_filename): # File, and it exists. 
return url_or_filename @@ -256,8 +258,19 @@ def s3_get(url, temp_file, proxies=None): s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) -def http_get(url, temp_file, proxies=None, resume_size=0): - headers={'Range':'bytes=%d-'%(resume_size,)} if resume_size > 0 else None +def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): + ua = "transformers/{}; python/{}".format(__version__, sys.version.split()[0]) + if isinstance(user_agent, dict): + ua += "; " + "; ".join( + "{}/{}".format(k, v) for k, v in user_agent.items() + ) + elif isinstance(user_agent, six.string_types): + ua += "; "+ user_agent + headers = { + "user-agent": ua + } + if resume_size > 0: + headers['Range'] = 'bytes=%d-' % (resume_size,) response = requests.get(url, stream=True, proxies=proxies, headers=headers) if response.status_code == 416: # Range not satisfiable return @@ -271,7 +284,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0): progress.close() -def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False): +def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10, resume_download=False, user_agent=None): """ Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file. @@ -342,7 +355,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag logger.warn('Warning: resumable downloads are not implemented for "s3://" urls') s3_get(url, temp_file, proxies=proxies) else: - http_get(url, temp_file, proxies=proxies, resume_size=resume_size) + http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) # we are copying the file before closing it, so flush to avoid truncation temp_file.flush() From 9d0d1cd339ca97e6a40e1f898fa3d3350e46e350 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 20 Dec 2019 09:30:37 +0100 Subject: [PATCH 252/302] Filter out entity for NER task. --- transformers/pipelines.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index e4bf9e0894..6b805a06db 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -450,11 +450,12 @@ class NerPipeline(Pipeline): score = np.exp(entities[token_start]) / np.exp(entities[token_start]).sum(-1, keepdims=True) label_idx = score.argmax() - answer += [{ - 'word': words[idx], - 'score': score[label_idx].item(), - 'entity': self.model.config.id2label[label_idx] - }] + if label_idx > 0: + answer += [{ + 'word': words[idx], + 'score': score[label_idx].item(), + 'entity': self.model.config.id2label[label_idx] + }] # Update token start token_start += len(list(word)) From e516a34a158593f82b6f22dc1e568ff8996f0389 Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 20 Dec 2019 09:38:08 +0100 Subject: [PATCH 253/302] Use BasicTokenizer to split over whitespaces. 
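The user_agent plumbing added by the customizable user-agent patch above only appends caller-supplied context to a base string; a tiny sketch of the header http_get builds, where the dict keys are purely illustrative (the 'transformers/<version>; python/<version>' base is what the diff constructs):

    import sys
    import transformers

    user_agent = {'origin': 'example-script', 'gpu': False}   # hypothetical entries
    ua = 'transformers/{}; python/{}'.format(transformers.__version__, sys.version.split()[0])
    ua += '; ' + '; '.join('{}/{}'.format(k, v) for k, v in user_agent.items())
    headers = {'user-agent': ua}
    # e.g. 'transformers/2.3.0; python/3.7.5; origin/example-script; gpu/False'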
--- transformers/pipelines.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 6b805a06db..c3e0ad1a8f 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -27,7 +27,7 @@ from typing import Union, Optional, Tuple, List, Dict import numpy as np from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \ - SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger + SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger, BasicTokenizer if is_tf_available(): import tensorflow as tf @@ -416,12 +416,19 @@ class NerPipeline(Pipeline): Named Entity Recognition pipeline using ModelForTokenClassification head. """ + def __init__(self, model, tokenizer: PreTrainedTokenizer = None, + args_parser: ArgumentHandler = None, device: int = -1, + binary_output: bool = False): + super().__init__(model, tokenizer, args_parser, device, binary_output) + + self._basic_tokenizer = BasicTokenizer(do_lower_case=False) + def __call__(self, *texts, **kwargs): inputs, answers = self._args_parser(*texts, **kwargs), [] for sentence in inputs: # Ugly token to word idx mapping (for now) - token_to_word, words = [], sentence.split(' ') + token_to_word, words = [], self._basic_tokenizer.tokenize(sentence) for i, w in enumerate(words): tokens = self.tokenizer.tokenize(w) token_to_word += [i] * len(tokens) From 61d9ee45e3f19b9c661e078e7f57dbe8fb8c812c Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 20 Dec 2019 11:47:56 +0100 Subject: [PATCH 254/302] All tests are green. --- transformers/pipelines.py | 9 +- transformers/tests/pipelines_test.py | 247 ++++++++++++++++----------- 2 files changed, 154 insertions(+), 102 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index c3e0ad1a8f..4dde62cbe5 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -343,8 +343,9 @@ class Pipeline(_ScikitCompat): if 'distilbert' not in model_type and 'xlm' not in model_type: args += ['token_type_ids'] - if 'xlnet' in model_type or 'xlm' in model_type: - args += ['cls_index', 'p_mask'] + # PR #1548 (CLI) There is an issue with attention_mask + # if 'xlnet' in model_type or 'xlm' in model_type: + # args += ['cls_index', 'p_mask'] if isinstance(features, dict): return {k: features[k] for k in args} @@ -380,7 +381,7 @@ class Pipeline(_ScikitCompat): predictions = self.model(inputs, training=False)[0] else: with torch.no_grad(): - predictions = self.model(**inputs).cpu()[0] + predictions = self.model(**inputs)[0].cpu() return predictions.numpy() @@ -444,7 +445,7 @@ class NerPipeline(Pipeline): # Forward if is_tf_available(): - entities = self.model(**tokens)[0][0].numpy() + entities = self.model(tokens)[0][0].numpy() else: with torch.no_grad(): entities = self.model(**tokens)[0][0].cpu().numpy() diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py index ee10234269..a8fe668221 100644 --- a/transformers/tests/pipelines_test.py +++ b/transformers/tests/pipelines_test.py @@ -1,113 +1,164 @@ import unittest from unittest.mock import patch +from typing import Iterable + +from transformers import pipeline +from transformers.tests.utils import require_tf, require_torch QA_FINETUNED_MODELS = { - 'bert-large-uncased-whole-word-masking-finetuned-squad', - 'bert-large-cased-whole-word-masking-finetuned-squad', - 'distilbert-base-uncased-distilled-squad', + 
('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None), + ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None), + ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None) +} +NER_FINETUNED_MODELS = { + ( + 'bert-base-cased', + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json' + ) +} + +FEATURE_EXTRACT_FINETUNED_MODELS = { + ('bert-base-cased', 'bert-base-cased', None), + # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 + ('distilbert-base-uncased', 'distilbert-base-uncased', None) +} + +TEXT_CLASSIF_FINETUNED_MODELS = { + ( + 'bert-base-uncased', + 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', + 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json' + ) } -class QuestionAnsweringPipelineTest(unittest.TestCase): - def check_answer_structure(self, answer, batch, topk): - self.assertIsInstance(answer, list) - self.assertEqual(len(answer), batch) - self.assertIsInstance(answer[0], list) - self.assertEqual(len(answer[0]), topk) - self.assertIsInstance(answer[0][0], dict) - - for item in answer[0]: - self.assertTrue('start' in item) - self.assertTrue('end' in item) - self.assertTrue('score' in item) - self.assertTrue('answer' in item) - - def question_answering_pipeline(self, nlp): - # Simple case with topk = 1, no batching - a = nlp(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.') - self.check_answer_structure(a, 1, 1) - - # Simple case with topk = 2, no batching - a = nlp(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.', topk=2) - self.check_answer_structure(a, 1, 2) - - # Batch case with topk = 1 - a = nlp(question=['What is the name of the company I\'m working for ?', 'Where is the company based ?'], - context=['I\'m working for Huggingface.', 'The company is based in New York and Paris']) - self.check_answer_structure(a, 2, 1) - - # Batch case with topk = 2 - a = nlp(question=['What is the name of the company I\'m working for ?', 'Where is the company based ?'], - context=['Where is the company based ?', 'The company is based in New York and Paris'], topk=2) - self.check_answer_structure(a, 2, 2) - - # check for data keyword - a = nlp(data=nlp.create_sample(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.')) - self.check_answer_structure(a, 1, 1) - - a = nlp(data=nlp.create_sample(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.'), topk=2) - self.check_answer_structure(a, 1, 2) - - a = nlp(data=[ - nlp.create_sample(question='What is the name of the company I\'m working for ?', context='I\'m working for Huggingface.'), - nlp.create_sample(question='I\'m working for Huggingface.', context='The company is based in New York and Paris'), - ]) - self.check_answer_structure(a, 2, 1) - - a = nlp(data=[ - {'question': 'What is the name of the company I\'m working for ?', 'context': 'I\'m working for Huggingface.'}, - {'question': 'Where is the company based ?', 'context': 'The company is based in New York and Paris'}, - ]) - self.check_answer_structure(a, 2, 1) - - # X 
keywords - a = nlp(X=nlp.create_sample( - question='Where is the company based ?', context='The company is based in New York and Paris' - )) - self.check_answer_structure(a, 1, 1) - - a = nlp(X=[ - {'question': 'What is the name of the company I\'m working for ?', 'context': 'I\'m working for Huggingface.'}, - {'question': 'Where is the company based ?', 'context': 'The company is based in New York and Paris'}, - ], topk=2) - self.check_answer_structure(a, 2, 2) - - @patch('transformers.pipelines.is_torch_available', return_value=False) - def test_tf_models(self, is_torch_available): - from transformers import pipeline - for model in QA_FINETUNED_MODELS: - self.question_answering_pipeline(pipeline('question-answering', model)) - - @patch('transformers.pipelines.is_tf_available', return_value=False) - @patch('transformers.tokenization_utils.is_tf_available', return_value=False) - def test_torch_models(self, is_tf_available, _): - from transformers import pipeline - for model in QA_FINETUNED_MODELS: - self.question_answering_pipeline(pipeline('question-answering', model)) +@require_tf +def tf_pipeline(*args, **kwargs): + return pipeline(**kwargs) -class AutoPipelineTest(unittest.TestCase): - @patch('transformers.pipelines.is_torch_available', return_value=False) - def test_tf_qa(self, is_torch_available): - from transformers import pipeline - from transformers.pipelines import QuestionAnsweringPipeline - from transformers.modeling_tf_utils import TFPreTrainedModel - for model in QA_FINETUNED_MODELS: - nlp = pipeline('question-answering', model) - self.assertIsInstance(nlp, QuestionAnsweringPipeline) - self.assertIsInstance(nlp.model, TFPreTrainedModel) +@require_torch +def torch_pipeline(*args, **kwargs): + return pipeline(**kwargs) - @patch('transformers.pipelines.is_tf_available', return_value=False) - def test_torch_qa(self, is_tf_available): - from transformers import pipeline - from transformers.pipelines import QuestionAnsweringPipeline - from transformers.modeling_utils import PreTrainedModel - for model in QA_FINETUNED_MODELS: - nlp = pipeline('question-answering', model) - self.assertIsInstance(nlp, QuestionAnsweringPipeline) - self.assertIsInstance(nlp.model, PreTrainedModel) + +class MonoColumnInputTestCase(unittest.TestCase): + def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]): + self.assertIsNotNone(nlp) + + mono_result = nlp(valid_inputs[0]) + self.assertIsInstance(mono_result, list) + self.assertIsInstance(mono_result[0], (dict, list)) + + if isinstance(mono_result[0], list): + mono_result = mono_result[0] + + for key in output_keys: + self.assertIn(key, mono_result[0]) + + multi_result = nlp(valid_inputs) + self.assertIsInstance(multi_result, list) + self.assertIsInstance(multi_result[0], (dict, list)) + + if isinstance(multi_result[0], list): + multi_result = multi_result[0] + + for result in multi_result: + for key in output_keys: + self.assertIn(key, result) + + self.assertRaises(Exception, nlp, invalid_inputs) + + def test_ner(self): + mandatory_keys = {'entity', 'word', 'score'} + valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + invalid_inputs = [None] + for tokenizer, model, config in NER_FINETUNED_MODELS: + with patch('transformers.pipelines.is_torch_available', return_value=False): + nlp = tf_pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, 
mandatory_keys) + + with patch('transformers.pipelines.is_tf_available', return_value=False): + nlp = torch_pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + + def test_sentiment_analysis(self): + mandatory_keys = {'label'} + valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + invalid_inputs = [None] + for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS: + with patch('transformers.pipelines.is_torch_available', return_value=False): + nlp = tf_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + + with patch('transformers.pipelines.is_tf_available', return_value=False): + nlp = torch_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + + def test_features_extraction(self): + valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + invalid_inputs = [None] + for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS: + with patch('transformers.pipelines.is_torch_available', return_value=False): + nlp = tf_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) + + with patch('transformers.pipelines.is_tf_available', return_value=False): + nlp = torch_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) + + +class MultiColumnInputTestCase(unittest.TestCase): + def _test_multicolumn_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]): + self.assertIsNotNone(nlp) + + mono_result = nlp(valid_inputs[0]) + self.assertIsInstance(mono_result, dict) + + for key in output_keys: + self.assertIn(key, mono_result) + + multi_result = nlp(valid_inputs) + self.assertIsInstance(multi_result, list) + self.assertIsInstance(multi_result[0], dict) + + for result in multi_result: + for key in output_keys: + self.assertIn(key, result) + + self.assertRaises(Exception, nlp, invalid_inputs[0]) + self.assertRaises(Exception, nlp, invalid_inputs) + + def test_question_answering(self): + mandatory_output_keys = {'score', 'answer', 'start', 'end'} + valid_samples = [ + {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'}, + { + 'question': 'In what field is HuggingFace working ?', + 'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.' 
+ } + ] + invalid_samples = [ + {'question': '', 'context': 'This is a test to try empty question edge case'}, + {'question': None, 'context': 'This is a test to try empty question edge case'}, + {'question': 'What is does with empty context ?', 'context': ''}, + {'question': 'What is does with empty context ?', 'context': None}, + ] + + for tokenizer, model, config in QA_FINETUNED_MODELS: + + # Test for Tensorflow + with patch('transformers.pipelines.is_torch_available', return_value=False): + nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) + + # Test for PyTorch + with patch('transformers.pipelines.is_tf_available', return_value=False): + nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) if __name__ == '__main__': From ca6bdb28f64763ccad6c5b0aef36049c797f1ef7 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 12:10:40 +0100 Subject: [PATCH 255/302] fix pipelines and rename model_card => modelcard --- transformers/__init__.py | 2 +- transformers/file_utils.py | 4 +- transformers/{model_card.py => modelcard.py} | 55 ++++----- transformers/pipelines.py | 112 ++++++++++++------- transformers/tests/model_card_test.py | 28 ++--- 5 files changed, 118 insertions(+), 83 deletions(-) rename transformers/{model_card.py => modelcard.py} (83%) diff --git a/transformers/__init__.py b/transformers/__init__.py index 80f140b31e..73a7f5d862 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -35,7 +35,7 @@ if is_sklearn_available(): from .data import glue_compute_metrics, xnli_compute_metrics # Model Cards -from .model_card import ModelCard +from .modelcard import ModelCard # Tokenizers from .tokenization_utils import (PreTrainedTokenizer) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 8a1c4db201..47fa588815 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -81,7 +81,7 @@ WEIGHTS_NAME = "pytorch_model.bin" TF2_WEIGHTS_NAME = 'tf_model.h5' TF_WEIGHTS_NAME = 'model.ckpt' CONFIG_NAME = "config.json" -MODEL_CARD_NAME = "model_card.json" +MODEL_CARD_NAME = "modelcard.json" DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]] DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]] @@ -339,7 +339,7 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag temp_file_manager = tempfile.NamedTemporaryFile resume_size = 0 - if not os.path.exists(cache_path) or force_download: + if etag is not None and (not os.path.exists(cache_path) or force_download): # Download to temporary file, then copy to cache dir once finished. # Otherwise you get corrupt cache entries if the download gets interrupted. 
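(Editor's note: a minimal sketch of what the `etag is not None` guard added in the hunk above amounts to; `should_download` is a made-up helper name used only for illustration, not part of the library.)

```python
import os

def should_download(etag, cache_path, force_download=False):
    # Only attempt a (re)download when the server could actually be reached
    # (an ETag was returned); otherwise fall back to the existing cache entry
    # rather than clobbering it after a failed fetch.
    return etag is not None and (not os.path.exists(cache_path) or force_download)

# Offline (no ETag) with a file already cached -> keep the cached copy.
assert should_download(None, __file__) is False
# Reachable server and nothing cached yet -> download.
assert should_download("some-etag", "/nonexistent/cache/entry") is True
```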
with temp_file_manager() as temp_file: diff --git a/transformers/model_card.py b/transformers/modelcard.py similarity index 83% rename from transformers/model_card.py rename to transformers/modelcard.py index baec7e8622..4a879235ae 100644 --- a/transformers/model_card.py +++ b/transformers/modelcard.py @@ -25,7 +25,8 @@ from io import open from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP -from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url +from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, WEIGHTS_NAME, TF2_WEIGHTS_NAME, \ + cached_path, is_remote_url, hf_bucket_url logger = logging.getLogger(__name__) @@ -89,7 +90,7 @@ class ModelCard(object): - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``. - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``. - a path to a `directory` containing a mode card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``. - - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/model_card.json``. + - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/modelcard.json``. cache_dir: (`optional`) string: Path to a directory in which a downloaded pre-trained model @@ -100,16 +101,14 @@ class ModelCard(object): - The values in kwargs of any keys which are model card attributes will be used to override the loaded values. - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter. - force_download: (`optional`) boolean, default False: - Force to (re-)download the model card file and override the cached version if it exists. - - resume_download: (`optional`) boolean, default False: - Do not delete incompletely recieved file. Attempt to resume the download if such a file exists. - proxies: (`optional`) dict, default None: A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}. The proxies are used on each request. + find_from_standard_name: (`optional`) boolean, default True: + If the pretrained_model_name_or_path ends with our standard model or config filenames, replace them with our standard modelcard filename. + Can be used to directly feed a model/config url and access the colocated modelcard. + return_unused_kwargs: (`optional`) bool: - If False, then this function returns just the final model card object. @@ -117,22 +116,21 @@ class ModelCard(object): Examples:: - model_card = ModelCard.from_pretrained('bert-base-uncased') # Download model card from S3 and cache. - model_card = ModelCard.from_pretrained('./test/saved_model/') # E.g. model card was saved using `save_pretrained('./test/saved_model/')` - model_card = ModelCard.from_pretrained('./test/saved_model/model_card.json') - model_card = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False) + modelcard = ModelCard.from_pretrained('bert-base-uncased') # Download model card from S3 and cache. + modelcard = ModelCard.from_pretrained('./test/saved_model/') # E.g. 
model card was saved using `save_pretrained('./test/saved_model/')` + modelcard = ModelCard.from_pretrained('./test/saved_model/modelcard.json') + modelcard = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False) """ cache_dir = kwargs.pop('cache_dir', None) - force_download = kwargs.pop('force_download', False) - resume_download = kwargs.pop('resume_download', False) proxies = kwargs.pop('proxies', None) + find_from_standard_name = kwargs.pop('find_from_standard_name', True) return_unused_kwargs = kwargs.pop('return_unused_kwargs', False) if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: - # For simplicity we use the same pretrained url than the configuration files but with a different suffix (model_card.json) + # For simplicity we use the same pretrained url than the configuration files + # but with a different suffix (modelcard.json). This suffix is replaced below. model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path] - model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) elif os.path.isdir(pretrained_model_name_or_path): model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME) elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): @@ -140,17 +138,22 @@ class ModelCard(object): else: model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME) + if find_from_standard_name or pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: + model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME) + model_card_file = model_card_file.replace(WEIGHTS_NAME, MODEL_CARD_NAME) + model_card_file = model_card_file.replace(TF2_WEIGHTS_NAME, MODEL_CARD_NAME) + try: # Load from URL or cache if already cached - resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download, - proxies=proxies, resume_download=resume_download) + resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=True, + proxies=proxies, resume_download=False) if resolved_model_card_file == model_card_file: logger.info("loading model card file {}".format(model_card_file)) else: logger.info("loading model card file {} from cache at {}".format( model_card_file, resolved_model_card_file)) # Load model card - model_card = cls.from_json_file(resolved_model_card_file) + modelcard = cls.from_json_file(resolved_model_card_file) except EnvironmentError: if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: @@ -166,7 +169,7 @@ class ModelCard(object): logger.warning("Creating an empty model card.") # We fall back on creating an empty model card - model_card = cls() + modelcard = cls() except json.JSONDecodeError: logger.warning("Couldn't reach server at '{}' to download model card file or " @@ -175,22 +178,22 @@ class ModelCard(object): logger.warning("Creating an empty model card.") # We fall back on creating an empty model card - model_card = cls() + modelcard = cls() # Update model card with kwargs if needed to_remove = [] for key, value in kwargs.items(): - if hasattr(model_card, key): - setattr(model_card, key, value) + if hasattr(modelcard, key): + setattr(modelcard, key, value) to_remove.append(key) for key in to_remove: kwargs.pop(key, None) - logger.info("Model card: %s", str(model_card)) + logger.info("Model card: %s", str(modelcard)) if return_unused_kwargs: - return model_card, kwargs + return modelcard, kwargs else: - return 
model_card + return modelcard @classmethod def from_dict(cls, json_object): diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 4dde62cbe5..be2b1db126 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -18,6 +18,8 @@ import csv import json import os import pickle +import logging +import six from abc import ABC, abstractmethod from contextlib import contextmanager from itertools import groupby @@ -26,8 +28,12 @@ from typing import Union, Optional, Tuple, List, Dict import numpy as np -from transformers import AutoConfig, AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, \ - SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, logger, BasicTokenizer +from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer, + PretrainedConfig, ModelCard, SquadExample, + squad_convert_examples_to_features, is_tf_available, + is_torch_available, BasicTokenizer, + ALL_PRETRAINED_MODEL_ARCHIVE_MAP, + ALL_PRETRAINED_CONFIG_ARCHIVE_MAP) if is_tf_available(): import tensorflow as tf @@ -40,6 +46,8 @@ if is_torch_available(): AutoModelForQuestionAnswering, AutoModelForTokenClassification +logger = logging.getLogger(__name__) + class ArgumentHandler(ABC): """ Base interface for handling varargs for each Pipeline @@ -271,11 +279,13 @@ class Pipeline(_ScikitCompat): nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...') """ def __init__(self, model, tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False): self.model = model self.tokenizer = tokenizer + self.modelcard = modelcard self.device = device self.binary_output = binary_output self._args_parser = args_parser or DefaultArgumentHandler() @@ -294,6 +304,7 @@ class Pipeline(_ScikitCompat): self.model.save_pretrained(save_directory) self.tokenizer.save_pretrained(save_directory) + self.modelcard.save_pretrained(save_directory) def transform(self, X): """ @@ -393,9 +404,10 @@ class FeatureExtractionPipeline(Pipeline): def __init__(self, model, tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, args_parser: ArgumentHandler = None, device: int = -1): - super().__init__(model, tokenizer, args_parser, device, binary_output=True) + super().__init__(model, tokenizer, modelcard, args_parser, device, binary_output=True) def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() @@ -418,9 +430,10 @@ class NerPipeline(Pipeline): """ def __init__(self, model, tokenizer: PreTrainedTokenizer = None, + modelcard: ModelCard = None, args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False): - super().__init__(model, tokenizer, args_parser, device, binary_output) + super().__init__(model, tokenizer, modelcard, args_parser, device, binary_output) self._basic_tokenizer = BasicTokenizer(do_lower_case=False) @@ -554,8 +567,10 @@ class QuestionAnsweringPipeline(Pipeline): else: return SquadExample(None, question, context, None, None, None) - def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer], device: int = -1, **kwargs): - super().__init__(model, tokenizer, args_parser=QuestionAnsweringArgumentHandler(), + def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer], + modelcard: Optional[ModelCard], + device: int = -1, **kwargs): + super().__init__(model, tokenizer, modelcard, args_parser=QuestionAnsweringArgumentHandler(), device=device, **kwargs) def __call__(self, 
*texts, **kwargs): @@ -725,7 +740,7 @@ SUPPORTED_TASKS = { 'default': { 'model': 'distilbert-base-uncased', 'config': None, - 'tokenizer': 'bert-base-uncased' + 'tokenizer': 'distilbert-base-uncased' } }, 'sentiment-analysis': { @@ -735,7 +750,7 @@ SUPPORTED_TASKS = { 'default': { 'model': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json', - 'tokenizer': 'bert-base-uncased' + 'tokenizer': 'distilbert-base-uncased' } }, 'ner': { @@ -745,7 +760,7 @@ SUPPORTED_TASKS = { 'default': { 'model': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json', - 'tokenizer': 'bert-base-cased' + 'tokenizer': 'bert-large-cased' } }, 'question-answering': { @@ -755,7 +770,7 @@ SUPPORTED_TASKS = { 'default': { 'model': 'distilbert-base-uncased-distilled-squad', 'config': None, - 'tokenizer': 'bert-base-uncased' + 'tokenizer': 'distilbert-base-uncased' } } } @@ -763,7 +778,9 @@ SUPPORTED_TASKS = { def pipeline(task: str, model: Optional = None, config: Optional[Union[str, PretrainedConfig]] = None, - tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, **kwargs) -> Pipeline: + tokenizer: Optional[Union[str, PreTrainedTokenizer]] = None, + modelcard: Optional[Union[str, ModelCard]] = None, + **kwargs) -> Pipeline: """ Utility factory method to build a pipeline. Pipeline are made of: @@ -777,48 +794,63 @@ def pipeline(task: str, model: Optional = None, pipeline('ner', model=AutoModel.from_pretrained(...), tokenizer=AutoTokenizer.from_pretrained(...) pipeline('ner', model='https://...pytorch-model.bin', config='https://...config.json', tokenizer='bert-base-cased') """ - # Try to infer tokenizer from model name (if provided as str) - if tokenizer is None: - if model is not None and not isinstance(model, str): - # Impossible to guest what is the right tokenizer here - raise Exception('Tokenizer cannot be None if provided model is a PreTrainedModel instance') - else: - tokenizer = model - # Retrieve the task if task not in SUPPORTED_TASKS: raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) - targeted_task = SUPPORTED_TASKS[task] - task, allocator = targeted_task['impl'], targeted_task['tf'] if is_tf_available() else targeted_task['pt'] + pipeline_framework = 'tf' if is_tf_available() else ('pt' if is_torch_available() else None) + if pipeline_framework is None: + raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. 
" + "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " + "To install PyTorch, read the instructions at https://pytorch.org/.") - # Handling for default model for the task + + targeted_task = SUPPORTED_TASKS[task] + task, model_class = targeted_task['impl'], targeted_task[pipeline_framework] + + # Use default model/config/tokenizer for the task if no model is provided if model is None: model, config, tokenizer = tuple(targeted_task['default'].values()) - # Allocate tokenizer - tokenizer = tokenizer if isinstance(tokenizer, PreTrainedTokenizer) else AutoTokenizer.from_pretrained(tokenizer) + # Try to infer tokenizer from model or config name (if provided as str) + if tokenizer is None: + if isinstance(model, str) and model in ALL_PRETRAINED_MODEL_ARCHIVE_MAP: + tokenizer = model + elif isinstance(config, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: + tokenizer = config + else: + # Impossible to guest what is the right tokenizer here + raise Exception("Impossible to guess which tokenizer to use. " + "Please provided a PretrainedTokenizer class or a path/url/shortcut name to a pretrained tokenizer.") - # Special handling for model conversion - if isinstance(model, str): - from_tf = model.endswith('.h5') and not is_tf_available() - from_pt = model.endswith('.bin') and not is_torch_available() + # Try to infer modelcard from model or config name (if provided as str) + if modelcard is None: + # Try to fallback on one of the provided string for model or config (will replace the suffix) + if isinstance(model, str): + modelcard = model + elif isinstance(config, str): + modelcard = config - if from_tf: - logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. ' - 'Trying to load the model with PyTorch.') - elif from_pt: - logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. ' - 'Trying to load the model with Tensorflow.') - else: - from_tf = from_pt = False + # Instantiate tokenizer if needed + if isinstance(tokenizer, six.string_types): + tokenizer = AutoTokenizer.from_pretrained(tokenizer) + # Instantiate config if needed if isinstance(config, str): config = AutoConfig.from_pretrained(config) + # Instantiate model if needed if isinstance(model, str): - if allocator.__name__.startswith('TF'): - model = allocator.from_pretrained(model, config=config, from_pt=from_pt) - else: - model = allocator.from_pretrained(model, config=config, from_tf=from_tf) + # Handle transparent TF/PT model conversion + model_kwargs = {} + if pipeline_framework == 'pt' and model.endswith('.h5'): + model_kwargs['from_tf'] = True + logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. ' + 'Trying to load the model with PyTorch.') + elif pipeline_framework == 'tf' and model.endswith('.bin'): + model_kwargs['from_pt'] = True + logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. 
' + 'Trying to load the model with Tensorflow.') + model = model_class.from_pretrained(model, config=config, **model_kwargs) + return task(model, tokenizer, **kwargs) diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py index e75716f0aa..b293b5726a 100644 --- a/transformers/tests/model_card_test.py +++ b/transformers/tests/model_card_test.py @@ -18,7 +18,7 @@ import os import json import unittest -from transformers.model_card import ModelCard +from transformers.modelcard import ModelCard from .tokenization_tests_commons import TemporaryDirectory class ModelCardTester(unittest.TestCase): @@ -49,20 +49,20 @@ class ModelCardTester(unittest.TestCase): } def test_model_card_common_properties(self): - model_card = ModelCard.from_dict(self.inputs_dict) - self.assertTrue(hasattr(model_card, 'model_details')) - self.assertTrue(hasattr(model_card, 'intended_use')) - self.assertTrue(hasattr(model_card, 'factors')) - self.assertTrue(hasattr(model_card, 'metrics')) - self.assertTrue(hasattr(model_card, 'evaluation_data')) - self.assertTrue(hasattr(model_card, 'training_data')) - self.assertTrue(hasattr(model_card, 'quantitative_analyses')) - self.assertTrue(hasattr(model_card, 'ethical_considerations')) - self.assertTrue(hasattr(model_card, 'caveats_and_recommendations')) + modelcard = ModelCard.from_dict(self.inputs_dict) + self.assertTrue(hasattr(modelcard, 'model_details')) + self.assertTrue(hasattr(modelcard, 'intended_use')) + self.assertTrue(hasattr(modelcard, 'factors')) + self.assertTrue(hasattr(modelcard, 'metrics')) + self.assertTrue(hasattr(modelcard, 'evaluation_data')) + self.assertTrue(hasattr(modelcard, 'training_data')) + self.assertTrue(hasattr(modelcard, 'quantitative_analyses')) + self.assertTrue(hasattr(modelcard, 'ethical_considerations')) + self.assertTrue(hasattr(modelcard, 'caveats_and_recommendations')) def test_model_card_to_json_string(self): - model_card = ModelCard.from_dict(self.inputs_dict) - obj = json.loads(model_card.to_json_string()) + modelcard = ModelCard.from_dict(self.inputs_dict) + obj = json.loads(modelcard.to_json_string()) for key, value in self.inputs_dict.items(): self.assertEqual(obj[key], value) @@ -70,7 +70,7 @@ class ModelCardTester(unittest.TestCase): model_card_first = ModelCard.from_dict(self.inputs_dict) with TemporaryDirectory() as tmpdirname: - filename = os.path.join(tmpdirname, u"model_card.json") + filename = os.path.join(tmpdirname, u"modelcard.json") model_card_first.to_json_file(filename) model_card_second = ModelCard.from_json_file(filename) From 1fa93ca1eaa249321ef39994e9f022d0799034a3 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 12:34:19 +0100 Subject: [PATCH 256/302] Clean up framework handling --- transformers/pipelines.py | 85 ++++++++++++++++++++++++++------------- 1 file changed, 58 insertions(+), 27 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index be2b1db126..1c56033f7c 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -48,6 +48,19 @@ if is_torch_available(): logger = logging.getLogger(__name__) +def get_framework(model=None): + if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): + # Both framework are available but the use supplied a model class instance. 
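(Editor's note: the class-name check introduced just below can be exercised on its own with stand-in classes; the dummy classes here are purely illustrative and assume only the library convention that TF 2.0 model classes carry a `TF` prefix.)

```python
# Stand-ins for a TF 2.0 and a PyTorch model class (illustrative only).
class TFBertModel: pass
class BertModel: pass

def guess_framework(model):
    # Same heuristic as get_framework(): a leading "TF" in the class name
    # indicates a TensorFlow 2.0 model, anything else is treated as PyTorch.
    return 'tf' if model.__class__.__name__.startswith('TF') else 'pt'

assert guess_framework(TFBertModel()) == 'tf'
assert guess_framework(BertModel()) == 'pt'
```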
+ # Try to guess which framework to use from the model classname + framework = 'tf' if model.__class__.__name__.startswith('TF') else 'pt' + else: + framework = 'tf' if is_tf_available() else ('pt' if is_torch_available() else None) + if framework is None: + raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. " + "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " + "To install PyTorch, read the instructions at https://pytorch.org/.") + return framework + class ArgumentHandler(ABC): """ Base interface for handling varargs for each Pipeline @@ -279,19 +292,23 @@ class Pipeline(_ScikitCompat): nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...') """ def __init__(self, model, tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, + modelcard: ModelCard = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False): + if framework is None: + framework = get_framework() + self.model = model self.tokenizer = tokenizer self.modelcard = modelcard + self.framework = framework self.device = device self.binary_output = binary_output self._args_parser = args_parser or DefaultArgumentHandler() # Special handling - if self.device >= 0 and not is_tf_available(): + if self.device >= 0 and self.framework == 'pt': self.model = self.model.to('cuda:{}'.format(self.device)) def save_pretrained(self, save_directory): @@ -332,7 +349,7 @@ class Pipeline(_ScikitCompat): Returns: Context manager """ - if is_tf_available(): + if self.framework == 'tf': with tf.device('/CPU:0' if self.device == -1 else '/device:GPU:{}'.format(self.device)): yield else: @@ -371,7 +388,7 @@ class Pipeline(_ScikitCompat): with self.device_placement(): inputs = self.tokenizer.batch_encode_plus( inputs, add_special_tokens=True, - return_tensors='tf' if is_tf_available() else 'pt', + return_tensors=self.framework, max_length=self.tokenizer.max_len ) @@ -387,7 +404,7 @@ class Pipeline(_ScikitCompat): Returns: Numpy array """ - if is_tf_available(): + if self.framework == 'tf': # TODO trace model predictions = self.model(inputs, training=False)[0] else: @@ -405,9 +422,16 @@ class FeatureExtractionPipeline(Pipeline): def __init__(self, model, tokenizer: PreTrainedTokenizer = None, modelcard: ModelCard = None, + framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1): - super().__init__(model, tokenizer, modelcard, args_parser, device, binary_output=True) + super().__init__(model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=True) def __call__(self, *args, **kwargs): return super().__call__(*args, **kwargs).tolist() @@ -430,10 +454,16 @@ class NerPipeline(Pipeline): """ def __init__(self, model, tokenizer: PreTrainedTokenizer = None, - modelcard: ModelCard = None, + modelcard: ModelCard = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, binary_output: bool = False): - super().__init__(model, tokenizer, modelcard, args_parser, device, binary_output) + super().__init__(model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=args_parser, + device=device, + binary_output=binary_output) self._basic_tokenizer = BasicTokenizer(do_lower_case=False) @@ -452,12 +482,12 @@ class NerPipeline(Pipeline): tokens = self.tokenizer.encode_plus( sentence, 
return_attention_mask=False, - return_tensors='tf' if is_tf_available() else 'pt', + return_tensors=self.framework, max_length=self.tokenizer.max_len ) # Forward - if is_tf_available(): + if self.framework == 'tf': entities = self.model(tokens)[0][0].numpy() else: with torch.no_grad(): @@ -549,6 +579,18 @@ class QuestionAnsweringPipeline(Pipeline): Question Answering pipeline using ModelForQuestionAnswering head. """ + def __init__(self, model, + tokenizer: Optional[PreTrainedTokenizer], + modelcard: Optional[ModelCard], + framework: Optional[str] = None, + device: int = -1, **kwargs): + super().__init__(model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + args_parser=QuestionAnsweringArgumentHandler(), + device=device, **kwargs) + @staticmethod def create_sample(question: Union[str, List[str]], context: Union[str, List[str]]) -> Union[SquadExample, List[SquadExample]]: """ @@ -567,12 +609,6 @@ class QuestionAnsweringPipeline(Pipeline): else: return SquadExample(None, question, context, None, None, None) - def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer], - modelcard: Optional[ModelCard], - device: int = -1, **kwargs): - super().__init__(model, tokenizer, modelcard, args_parser=QuestionAnsweringArgumentHandler(), - device=device, **kwargs) - def __call__(self, *texts, **kwargs): """ Args: @@ -608,7 +644,7 @@ class QuestionAnsweringPipeline(Pipeline): # Manage tensor allocation on correct device with self.device_placement(): - if is_tf_available(): + if self.framework == 'tf': fw_args = {k: tf.constant(v) for (k, v) in fw_args.items()} start, end = self.model(fw_args) start, end = start.numpy(), end.numpy() @@ -798,15 +834,10 @@ def pipeline(task: str, model: Optional = None, if task not in SUPPORTED_TASKS: raise KeyError("Unknown task {}, available tasks are {}".format(task, list(SUPPORTED_TASKS.keys()))) - pipeline_framework = 'tf' if is_tf_available() else ('pt' if is_torch_available() else None) - if pipeline_framework is None: - raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. " - "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " - "To install PyTorch, read the instructions at https://pytorch.org/.") - + framework = get_framework(model) targeted_task = SUPPORTED_TASKS[task] - task, model_class = targeted_task['impl'], targeted_task[pipeline_framework] + task, model_class = targeted_task['impl'], targeted_task[framework] # Use default model/config/tokenizer for the task if no model is provided if model is None: @@ -843,14 +874,14 @@ def pipeline(task: str, model: Optional = None, if isinstance(model, str): # Handle transparent TF/PT model conversion model_kwargs = {} - if pipeline_framework == 'pt' and model.endswith('.h5'): + if framework == 'pt' and model.endswith('.h5'): model_kwargs['from_tf'] = True logger.warning('Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. ' 'Trying to load the model with PyTorch.') - elif pipeline_framework == 'tf' and model.endswith('.bin'): + elif framework == 'tf' and model.endswith('.bin'): model_kwargs['from_pt'] = True logger.warning('Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. 
' 'Trying to load the model with Tensorflow.') model = model_class.from_pretrained(model, config=config, **model_kwargs) - return task(model, tokenizer, **kwargs) + return task(model=model, tokenizer=tokenizer, modelcard=modelcard, framework=framework, **kwargs) From 825697cad4907595bbb76eb43d96962d7bd52117 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 12:51:10 +0100 Subject: [PATCH 257/302] fix tests --- transformers/pipelines.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 1c56033f7c..8b5a14fc56 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -32,7 +32,6 @@ from transformers import (AutoConfig, AutoTokenizer, PreTrainedTokenizer, PretrainedConfig, ModelCard, SquadExample, squad_convert_examples_to_features, is_tf_available, is_torch_available, BasicTokenizer, - ALL_PRETRAINED_MODEL_ARCHIVE_MAP, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP) if is_tf_available(): @@ -845,9 +844,9 @@ def pipeline(task: str, model: Optional = None, # Try to infer tokenizer from model or config name (if provided as str) if tokenizer is None: - if isinstance(model, str) and model in ALL_PRETRAINED_MODEL_ARCHIVE_MAP: + if isinstance(model, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: tokenizer = model - elif isinstance(config, str) and model in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: + elif isinstance(config, str) and config in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP: tokenizer = config else: # Impossible to guest what is the right tokenizer here From 01ffc65e9b4e74d0399212435da2d46c6edf2563 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 13:16:23 +0100 Subject: [PATCH 258/302] update tests to remove unittest.patch --- transformers/pipelines.py | 9 ++- transformers/tests/pipelines_test.py | 91 +++++++++++++++++----------- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 8b5a14fc56..efb1de92e1 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -48,16 +48,19 @@ if is_torch_available(): logger = logging.getLogger(__name__) def get_framework(model=None): + """ Select framework (TensorFlow/PyTorch) to use. + If both frameworks are installed and no specific model is provided, defaults to using TensorFlow. + """ if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): # Both framework are available but the use supplied a model class instance. # Try to guess which framework to use from the model classname framework = 'tf' if model.__class__.__name__.startswith('TF') else 'pt' - else: - framework = 'tf' if is_tf_available() else ('pt' if is_torch_available() else None) - if framework is None: + elif not is_tf_available() and not is_torch_available(): raise ImportError("At least one of TensorFlow 2.0 or PyTorch should be installed. 
" "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " "To install PyTorch, read the instructions at https://pytorch.org/.") + else: + framework = 'tf' if is_tf_available() else 'pt' return framework class ArgumentHandler(ABC): diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py index a8fe668221..14bf07ee30 100644 --- a/transformers/tests/pipelines_test.py +++ b/transformers/tests/pipelines_test.py @@ -1,5 +1,4 @@ import unittest -from unittest.mock import patch from typing import Iterable @@ -35,16 +34,6 @@ TEXT_CLASSIF_FINETUNED_MODELS = { } -@require_tf -def tf_pipeline(*args, **kwargs): - return pipeline(**kwargs) - - -@require_torch -def torch_pipeline(*args, **kwargs): - return pipeline(**kwargs) - - class MonoColumnInputTestCase(unittest.TestCase): def _test_mono_column_pipeline(self, nlp, valid_inputs: list, invalid_inputs: list, output_keys: Iterable[str]): self.assertIsNotNone(nlp) @@ -72,43 +61,57 @@ class MonoColumnInputTestCase(unittest.TestCase): self.assertRaises(Exception, nlp, invalid_inputs) + @require_torch def test_ner(self): mandatory_keys = {'entity', 'word', 'score'} valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] invalid_inputs = [None] for tokenizer, model, config in NER_FINETUNED_MODELS: - with patch('transformers.pipelines.is_torch_available', return_value=False): - nlp = tf_pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) - self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) - with patch('transformers.pipelines.is_tf_available', return_value=False): - nlp = torch_pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) - self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + @require_tf + def test_tf_ner(self): + mandatory_keys = {'entity', 'word', 'score'} + valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + invalid_inputs = [None] + for tokenizer, model, config in NER_FINETUNED_MODELS: + nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + @require_torch def test_sentiment_analysis(self): mandatory_keys = {'label'} valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] invalid_inputs = [None] for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS: - with patch('transformers.pipelines.is_torch_available', return_value=False): - nlp = tf_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) - self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) - with patch('transformers.pipelines.is_tf_available', return_value=False): - nlp = torch_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) - self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + @require_tf + def test_tf_sentiment_analysis(self): + mandatory_keys = {'label'} + valid_inputs = 
['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + invalid_inputs = [None] + for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS: + nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) + @require_torch def test_features_extraction(self): valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] invalid_inputs = [None] for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS: - with patch('transformers.pipelines.is_torch_available', return_value=False): - nlp = tf_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) - self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) + nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) - with patch('transformers.pipelines.is_tf_available', return_value=False): - nlp = torch_pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) - self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) + @require_tf + def test_tf_features_extraction(self): + valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] + invalid_inputs = [None] + for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS: + nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) + self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) class MultiColumnInputTestCase(unittest.TestCase): @@ -132,6 +135,7 @@ class MultiColumnInputTestCase(unittest.TestCase): self.assertRaises(Exception, nlp, invalid_inputs[0]) self.assertRaises(Exception, nlp, invalid_inputs) + @require_torch def test_question_answering(self): mandatory_output_keys = {'score', 'answer', 'start', 'end'} valid_samples = [ @@ -149,16 +153,29 @@ class MultiColumnInputTestCase(unittest.TestCase): ] for tokenizer, model, config in QA_FINETUNED_MODELS: + nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) - # Test for Tensorflow - with patch('transformers.pipelines.is_torch_available', return_value=False): - nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) - self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) + @require_tf + def test_tf_question_answering(self): + mandatory_output_keys = {'score', 'answer', 'start', 'end'} + valid_samples = [ + {'question': 'Where was HuggingFace founded ?', 'context': 'HuggingFace was founded in Paris.'}, + { + 'question': 'In what field is HuggingFace working ?', + 'context': 'HuggingFace is a startup based in New-York founded in Paris which is trying to solve NLP.' 
+ } + ] + invalid_samples = [ + {'question': '', 'context': 'This is a test to try empty question edge case'}, + {'question': None, 'context': 'This is a test to try empty question edge case'}, + {'question': 'What is does with empty context ?', 'context': ''}, + {'question': 'What is does with empty context ?', 'context': None}, + ] - # Test for PyTorch - with patch('transformers.pipelines.is_tf_available', return_value=False): - nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) - self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) + for tokenizer, model, config in QA_FINETUNED_MODELS: + nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) + self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) if __name__ == '__main__': From 15dda5ea32655f2ea565f8d5cd586a036399dba3 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 13:20:41 +0100 Subject: [PATCH 259/302] remove python 2 tests for circle-ci cc @aaugustin @julien-c @LysandreJik --- .circleci/config.yml | 28 ---------------------------- 1 file changed, 28 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 7a64eaba7d..b094067eb5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -44,32 +44,6 @@ jobs: - run: sudo pip install tensorboardX scikit-learn - run: python -m pytest -sv ./transformers/tests/ --cov - run: codecov - build_py2_torch: - working_directory: ~/transformers - resource_class: large - parallelism: 1 - docker: - - image: circleci/python:2.7 - steps: - - checkout - - run: sudo pip install torch - - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest codecov pytest-cov - - run: python -m pytest -sv ./transformers/tests/ --cov - - run: codecov - build_py2_tf: - working_directory: ~/transformers - resource_class: large - parallelism: 1 - docker: - - image: circleci/python:2.7 - steps: - - checkout - - run: sudo pip install tensorflow - - run: sudo pip install --progress-bar off . 
- - run: sudo pip install pytest codecov pytest-cov - - run: python -m pytest -sv ./transformers/tests/ --cov - - run: codecov build_py3_custom_tokenizers: working_directory: ~/transformers docker: @@ -116,6 +90,4 @@ workflows: - build_py3_torch_and_tf - build_py3_torch - build_py3_tf - - build_py2_torch - - build_py2_tf - deploy_doc: *workflow_filters From 73fcebf7ec122e68b93f50fc770f0515502eb025 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 13:47:35 +0100 Subject: [PATCH 260/302] update serving command --- setup.py | 6 +++--- transformers-cli | 2 +- transformers/commands/serving.py | 35 +++++++++++++++++++++----------- 3 files changed, 27 insertions(+), 16 deletions(-) diff --git a/setup.py b/setup.py index 6560cc4968..4bfb774155 100644 --- a/setup.py +++ b/setup.py @@ -38,9 +38,9 @@ from setuptools import find_packages, setup extras = { - 'serving': ['uvicorn', 'fastapi'], - 'serving-tf': ['uvicorn', 'fastapi', 'tensorflow'], - 'serving-torch': ['uvicorn', 'fastapi', 'torch'] + 'serving': ['pydantic', 'uvicorn', 'fastapi'], + 'serving-tf': ['pydantic', 'uvicorn', 'fastapi', 'tensorflow'], + 'serving-torch': ['pydantic', 'uvicorn', 'fastapi', 'torch'] } extras['all'] = [package for package in extras.values()] diff --git a/transformers-cli b/transformers-cli index db2bd0e2a3..0a980a3574 100755 --- a/transformers-cli +++ b/transformers-cli @@ -3,9 +3,9 @@ from argparse import ArgumentParser from transformers.commands.download import DownloadCommand from transformers.commands.run import RunCommand -from transformers.commands.serving import ServeCommand from transformers.commands.user import UserCommands from transformers.commands.convert import ConvertCommand +from transformers.commands.serving import ServeCommand if __name__ == '__main__': parser = ArgumentParser('Transformers CLI tool', usage='transformers-cli []') diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index a7321470ce..3c3f852809 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -1,16 +1,23 @@ from argparse import ArgumentParser, Namespace from typing import List, Optional, Union, Any -from fastapi import FastAPI, HTTPException, Body -from logging import getLogger +import logging -from pydantic import BaseModel -from uvicorn import run +try: + from uvicorn import run + from fastapi import FastAPI, HTTPException, Body + from pydantic import BaseModel + _serve_dependancies_installed = True +except (ImportError, AttributeError): + BaseModel = object + Body = lambda *x, **y: None + _serve_dependancies_installed = False from transformers import Pipeline from transformers.commands import BaseTransformersCLICommand from transformers.pipelines import SUPPORTED_TASKS, pipeline +logger = logging.getLogger('transformers-cli/serving') def serve_command_factory(args: Namespace): """ @@ -70,20 +77,24 @@ class ServeCommand(BaseTransformersCLICommand): serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int): - self._logger = getLogger('transformers-cli/serving') self._pipeline = pipeline - self._logger.info('Serving model over {}:{}'.format(host, port)) self._host = host self._port = port - self._app = FastAPI() + if not _serve_dependancies_installed: + raise ImportError("Using serve command requires FastAPI and unicorn. " + "Please install transformers with [serving]: pip install transformers[serving]." 
+ "Or install FastAPI and unicorn separatly.") + else: + logger.info('Serving model over {}:{}'.format(host, port)) + self._app = FastAPI() - # Register routes - self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET']) - self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST']) - self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST']) - self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST']) + # Register routes + self._app.add_api_route('/', self.model_info, response_model=ServeModelInfoResult, methods=['GET']) + self._app.add_api_route('/tokenize', self.tokenize, response_model=ServeTokenizeResult, methods=['POST']) + self._app.add_api_route('/detokenize', self.detokenize, response_model=ServeDeTokenizeResult, methods=['POST']) + self._app.add_api_route('/forward', self.forward, response_model=ServeForwardResult, methods=['POST']) def run(self): run(self._app, host=self._host, port=self._port) From c37815f1300519e1a812e1080c46641db6f9f604 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 14:35:40 +0100 Subject: [PATCH 261/302] clean up PT <=> TF 2.0 conversion and config loading --- .../convert_pytorch_checkpoint_to_tf2.py | 9 +++++---- transformers/modeling_tf_utils.py | 17 ++++++++++++----- transformers/modeling_utils.py | 17 ++++++++++++----- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py index 4a9832f123..0edac6fb7d 100644 --- a/transformers/convert_pytorch_checkpoint_to_tf2.py +++ b/transformers/convert_pytorch_checkpoint_to_tf2.py @@ -32,7 +32,7 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model, TransfoXLConfig, TFTransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP, OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, - DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, + DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, TFDistilBertForSequenceClassification, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, AlbertConfig, TFAlbertForMaskedLM, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP) @@ -47,7 +47,7 @@ if is_torch_available(): TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, DistilBertForQuestionAnswering, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) @@ -59,7 +59,7 @@ else: TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, - DistilBertForMaskedLM, 
DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, + DistilBertForMaskedLM, DistilBertForSequenceClassification, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = ( @@ -70,7 +70,7 @@ else: None, None, None, None, None, None, None, - None, None, None, + None, None, None, None, None, None, None, None, None, None) @@ -93,6 +93,7 @@ MODEL_CLASSES = { 'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), + 'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP), 'albert': (AlbertConfig, TFAlbertForMaskedLM, AlbertForMaskedLM, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP), 't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP), diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py index 401ffeb67e..0aa65a9f17 100644 --- a/transformers/modeling_tf_utils.py +++ b/transformers/modeling_tf_utils.py @@ -184,7 +184,9 @@ class TFPreTrainedModel(tf.keras.Model): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) one of: + - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or + - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()` Configuration for the model to use instead of an automatically loaded configuation. 
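(Editor's note: the two call styles this docstring now documents might look roughly as follows; the shortcut name is only an example, and the snippet assumes TensorFlow 2.0 and `transformers` are installed.)

```python
from transformers import BertConfig, TFBertModel

# 1) config passed as an already-instantiated PretrainedConfig
config = BertConfig.from_pretrained('bert-base-cased', output_hidden_states=True)
model = TFBertModel.from_pretrained('bert-base-cased', config=config)

# 2) config passed as a string, resolved internally via the config class's from_pretrained()
model = TFBertModel.from_pretrained('bert-base-cased', config='bert-base-cased')
```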
Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or @@ -236,10 +238,11 @@ class TFPreTrainedModel(tf.keras.Model): proxies = kwargs.pop('proxies', None) output_loading_info = kwargs.pop('output_loading_info', False) - # Load config - if config is None: + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( - pretrained_model_name_or_path, *model_args, + config_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, @@ -310,7 +313,11 @@ class TFPreTrainedModel(tf.keras.Model): assert os.path.isfile(resolved_archive_file), "Error retrieving file {}".format(resolved_archive_file) # 'by_name' allow us to do transfer learning by skipping/adding layers # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357 - model.load_weights(resolved_archive_file, by_name=True) + try: + model.load_weights(resolved_archive_file, by_name=True) + except OSError: + raise OSError("Unable to load weights from h5 file. " + "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. ") ret = model(model.dummy_inputs, training=False) # Make sure restore ops are run diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index eff54f71e1..3bc407e4a3 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -281,7 +281,9 @@ class PreTrainedModel(nn.Module): model_args: (`optional`) Sequence of positional arguments: All remaning positional arguments will be passed to the underlying model's ``__init__`` method - config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`: + config: (`optional`) one of: + - an instance of a class derived from :class:`~transformers.PretrainedConfig`, or + - a string valid as input to :func:`~transformers.PretrainedConfig.from_pretrained()` Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when: - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or @@ -336,10 +338,11 @@ class PreTrainedModel(nn.Module): proxies = kwargs.pop('proxies', None) output_loading_info = kwargs.pop('output_loading_info', False) - # Load config - if config is None: + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path config, model_kwargs = cls.config_class.from_pretrained( - pretrained_model_name_or_path, *model_args, + config_path, *model_args, cache_dir=cache_dir, return_unused_kwargs=True, force_download=force_download, resume_download=resume_download, @@ -408,7 +411,11 @@ class PreTrainedModel(nn.Module): model = cls(config, *model_args, **model_kwargs) if state_dict is None and not from_tf: - state_dict = torch.load(resolved_archive_file, map_location='cpu') + try: + state_dict = torch.load(resolved_archive_file, map_location='cpu') + except: + raise OSError("Unable to load weights from pytorch checkpoint file. 
" + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ") missing_keys = [] unexpected_keys = [] From 7f74084528b8f9fb7678b82829366f11326af62f Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 20 Dec 2019 14:47:04 +0100 Subject: [PATCH 262/302] Fix leading axis added when saving through the command run --- transformers/commands/run.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index 78109b2a16..50c42d3a40 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -50,9 +50,9 @@ class RunCommand(BaseTransformersCLICommand): nlp, output = self._nlp, [] for entry in self._reader: if self._reader.is_multi_columns: - output += [nlp(**entry)] + output += nlp(**entry) else: - output += [nlp(entry)] + output += nlp(entry) # Saving data if self._nlp.binary_output: From db0795b5d0da0a9968035751b246854240a2c2ec Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 15:07:00 +0100 Subject: [PATCH 263/302] defaults models for tf and pt - update tests --- transformers/pipelines.py | 23 ++++++++++++++---- transformers/tests/pipelines_test.py | 36 ++++++++++++++++++++++++---- 2 files changed, 50 insertions(+), 9 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index efb1de92e1..33e1ab4022 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -776,7 +776,10 @@ SUPPORTED_TASKS = { 'tf': TFAutoModel if is_tf_available() else None, 'pt': AutoModel if is_torch_available() else None, 'default': { - 'model': 'distilbert-base-uncased', + 'model': { + 'pt': 'distilbert-base-uncased', + 'tf': 'distilbert-base-uncased', + }, 'config': None, 'tokenizer': 'distilbert-base-uncased' } @@ -786,7 +789,10 @@ SUPPORTED_TASKS = { 'tf': TFAutoModelForSequenceClassification if is_tf_available() else None, 'pt': AutoModelForSequenceClassification if is_torch_available() else None, 'default': { - 'model': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', + 'model': { + 'pt': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-pytorch_model.bin', + 'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5', + }, 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json', 'tokenizer': 'distilbert-base-uncased' } @@ -796,7 +802,10 @@ SUPPORTED_TASKS = { 'tf': TFAutoModelForTokenClassification if is_tf_available() else None, 'pt': AutoModelForTokenClassification if is_torch_available() else None, 'default': { - 'model': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', + 'model': { + 'pt':'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-pytorch_model.bin', + 'tf': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5', + }, 'config': 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json', 'tokenizer': 'bert-large-cased' } @@ -806,7 +815,10 @@ SUPPORTED_TASKS = { 'tf': TFAutoModelForQuestionAnswering if is_tf_available() else None, 'pt': AutoModelForQuestionAnswering if is_torch_available() else None, 'default': { - 'model': 
'distilbert-base-uncased-distilled-squad', + 'model': { + 'pt': 'distilbert-base-uncased-distilled-squad', + 'tf': 'distilbert-base-uncased-distilled-squad', + }, 'config': None, 'tokenizer': 'distilbert-base-uncased' } @@ -843,7 +855,8 @@ def pipeline(task: str, model: Optional = None, # Use default model/config/tokenizer for the task if no model is provided if model is None: - model, config, tokenizer = tuple(targeted_task['default'].values()) + models, config, tokenizer = tuple(targeted_task['default'].values()) + model = models[framework] # Try to infer tokenizer from model or config name (if provided as str) if tokenizer is None: diff --git a/transformers/tests/pipelines_test.py b/transformers/tests/pipelines_test.py index 14bf07ee30..08a1507770 100644 --- a/transformers/tests/pipelines_test.py +++ b/transformers/tests/pipelines_test.py @@ -11,6 +11,20 @@ QA_FINETUNED_MODELS = { ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None) } +TF_QA_FINETUNED_MODELS = { + ('bert-base-uncased', 'bert-large-uncased-whole-word-masking-finetuned-squad', None), + ('bert-base-cased', 'bert-large-cased-whole-word-masking-finetuned-squad', None), + ('bert-base-uncased', 'distilbert-base-uncased-distilled-squad', None) +} + +TF_NER_FINETUNED_MODELS = { + ( + 'bert-base-cased', + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-tf_model.h5', + 'https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-finetuned-conll03-english-config.json' + ) +} + NER_FINETUNED_MODELS = { ( 'bert-base-cased', @@ -25,6 +39,20 @@ FEATURE_EXTRACT_FINETUNED_MODELS = { ('distilbert-base-uncased', 'distilbert-base-uncased', None) } +TF_FEATURE_EXTRACT_FINETUNED_MODELS = { + ('bert-base-cased', 'bert-base-cased', None), + # ('xlnet-base-cased', 'xlnet-base-cased', None), # Disabled for now as it crash for TF2 + ('distilbert-base-uncased', 'distilbert-base-uncased', None) +} + +TF_TEXT_CLASSIF_FINETUNED_MODELS = { + ( + 'bert-base-uncased', + 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-tf_model.h5', + 'https://s3.amazonaws.com/models.huggingface.co/bert/distilbert-base-uncased-finetuned-sst-2-english-config.json' + ) +} + TEXT_CLASSIF_FINETUNED_MODELS = { ( 'bert-base-uncased', @@ -75,7 +103,7 @@ class MonoColumnInputTestCase(unittest.TestCase): mandatory_keys = {'entity', 'word', 'score'} valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] invalid_inputs = [None] - for tokenizer, model, config in NER_FINETUNED_MODELS: + for tokenizer, model, config in TF_NER_FINETUNED_MODELS: nlp = pipeline(task='ner', model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @@ -93,7 +121,7 @@ class MonoColumnInputTestCase(unittest.TestCase): mandatory_keys = {'label'} valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 'HuggingFace is based in New-York & Paris'] invalid_inputs = [None] - for tokenizer, model, config in TEXT_CLASSIF_FINETUNED_MODELS: + for tokenizer, model, config in TF_TEXT_CLASSIF_FINETUNED_MODELS: nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, mandatory_keys) @@ -109,7 +137,7 @@ class MonoColumnInputTestCase(unittest.TestCase): def test_tf_features_extraction(self): valid_inputs = ['HuggingFace is solving NLP one commit at a time.', 
'HuggingFace is based in New-York & Paris'] invalid_inputs = [None] - for tokenizer, model, config in FEATURE_EXTRACT_FINETUNED_MODELS: + for tokenizer, model, config in TF_FEATURE_EXTRACT_FINETUNED_MODELS: nlp = pipeline(task='sentiment-analysis', model=model, config=config, tokenizer=tokenizer) self._test_mono_column_pipeline(nlp, valid_inputs, invalid_inputs, {}) @@ -173,7 +201,7 @@ class MultiColumnInputTestCase(unittest.TestCase): {'question': 'What is does with empty context ?', 'context': None}, ] - for tokenizer, model, config in QA_FINETUNED_MODELS: + for tokenizer, model, config in TF_QA_FINETUNED_MODELS: nlp = pipeline(task='question-answering', model=model, config=config, tokenizer=tokenizer) self._test_multicolumn_pipeline(nlp, valid_samples, invalid_samples, mandatory_output_keys) From 4e3f745ba4e754e415c184d53c874031101d263b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Louf?= Date: Fri, 20 Dec 2019 11:13:46 +0100 Subject: [PATCH 264/302] add example for Model2Model in quickstart --- docs/source/quickstart.md | 95 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 1 deletion(-) diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md index 530aff8eb0..60e2cf3fd8 100644 --- a/docs/source/quickstart.md +++ b/docs/source/quickstart.md @@ -219,4 +219,97 @@ sequence = tokenizer.decode(generated) print(sequence) ``` -The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`. \ No newline at end of file +The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`. + +### Model2Model example + +Encoder-decoder architectures require two tokenized inputs: one for the encoder and the other one for the decoder. Let's assume that we want to use `Model2Model` for generative question answering, and start by tokenizing the question and answer that will be fed to the model. + +```python +import torch +from transformers import BertTokenizer, Model2Model + +# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows +import logging +logging.basicConfig(level=logging.INFO) + +# Load pre-trained model tokenizer (vocabulary) +tokenizer = BertTokenizer.from_pretrained('bert-base-uncased') + +# Encode the input to the encoder (the question) +question = "Who was Jim Henson?" +encoded_question = tokenizer.encode(question) + +# Encode the input to the decoder (the answer) +answer = "Jim Henson was a puppeteer" +encoded_answer = tokenizer.encode(answer) + +# Convert inputs to PyTorch tensors +question_tensor = torch.tensor([encoded_question]) +answer_tensor = torch.tensor([encoded_answer]) +``` + +Let's see how we can use `Model2Model` to get the value of the loss associated with this (question, answer) pair: + +```python +# In order to compute the loss we need to provide language model +# labels (the token ids that the model should have produced) to +# the decoder. +lm_labels = encoded_answer +labels_tensor = torch.tensor([lm_labels]) + +# Load pre-trained model (weights) +model = Model2Model.from_pretrained('bert-base-uncased') + +# Set the model in evaluation mode to deactivate the DropOut modules +# This is IMPORTANT to have reproducible results during evaluation! 
+model.eval() + +# If you have a GPU, put everything on cuda +question_tensor = question_tensor.to('cuda') +answer_tensor = answer_tensor.to('cuda') +labels_tensor = labels_tensor.to('cuda') +model.to('cuda') + +# Predict hidden states features for each layer +with torch.no_grad(): + # See the models docstrings for the detail of the inputs + outputs = model(question_tensor, answer_tensor, decoder_lm_labels=labels_tensor) + # Transformers models always output tuples. + # See the models docstrings for the detail of all the outputs + # In our case, the first element is the value of the LM loss + lm_loss = outputs[0] +``` + +This loss can be used to fine-tune `Model2Model` on the question answering task. Assuming that we fine-tuned the model, let us now see how to generate an answer: + +```python +# Let's re-use the previous question +question = "Who was Jim Henson?" +encoded_question = tokenizer.encode(question) +question_tensor = torch.tensor([encoded_question]) + +# This time we try to generate the answer, so we start with an empty sequence +answer = "[CLS]" +encoded_answer = tokenizer.encode(answer, add_special_tokens=False) +answer_tensor = torch.tensor([encoded_answer]) + +# Load pre-trained model (weights) +model = Model2Model.from_pretrained('fine-tuned-weights') +model.eval() + +# If you have a GPU, put everything on cuda +question_tensor = encoded_question.to('cuda') +answer_tensor = encoded_answer.to('cuda') +model.to('cuda') + +# Predict all tokens +with torch.no_grad(): + outputs = model(question_tensor, answer_tensor) + predictions = outputs[0] + +# confirm we were able to predict 'jim' +predicted_index = torch.argmax(predictions[0, -1]).item() +predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0] +assert predicted_token == 'jim' +``` From b98ff8854460a04fe076c704555705c5d5e1b6de Mon Sep 17 00:00:00 2001 From: Morgan Funtowicz Date: Fri, 20 Dec 2019 15:52:50 +0100 Subject: [PATCH 265/302] Added pipelines quick tour in README --- README.md | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/README.md b/README.md index a9d0fb3ace..1312fcc0ac 100644 --- a/README.md +++ b/README.md @@ -490,6 +490,35 @@ transformers-cli ls # List all your S3 objects. ``` +## Quick tour of pipelines + +New in version `v2.3`: `Pipeline` are high-level objects which automatically handle tokenization, running your data through a transformers model +and outputting the result in a structured object. + +You can create `Pipeline` objects for the following down-stream tasks: + - `feature-extraction`: Generates a tensor representation for the input sequence + - `ner`: Generates named entity mapping for each word in the input sequence. + - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence. + - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question + in the context. 
+ +```python +from transformers import pipeline + +# Allocate a pipeline for sentiment-analysis +nlp = pipeline('sentiment-analysis') +nlp('We are very happy to include pipeline into the transformers repository.') +>>> {'label': 'POSITIVE', 'score': 0.99893874} + +# Allocate a pipeline for question-answering +nlp = pipeline('question-answering') +nlp({ + 'question': 'What is the name of the repository ?', + 'context': 'Pipeline have been included in the huggingface/transformers repository' +}) +>>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'} +``` + ## Migrating from pytorch-transformers to transformers Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`. From 90debb9ff2636f3f1c8256237deeca3a1ec3c7dd Mon Sep 17 00:00:00 2001 From: Dirk Groeneveld Date: Thu, 19 Dec 2019 17:03:01 -0800 Subject: [PATCH 266/302] Keep even the first of the special tokens intact while lowercasing. --- transformers/tokenization_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 59e2d05212..2635a38db0 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -642,7 +642,7 @@ class PreTrainedTokenizer(object): def lowercase_text(t): # convert non-special tokens to lowercase escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens] - pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \ + pattern = r'(' + r'|'.join(escaped_special_toks) + r')|' + \ r'(.+?)' return re.sub( pattern, From fb393ad9945f66b081f88b81b90a2974d81e9601 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 20 Dec 2019 11:29:58 -0500 Subject: [PATCH 267/302] Added test for all special tokens --- transformers/tests/tokenization_tests_commons.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index 13e7ae746a..cdec24d9f0 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -133,6 +133,14 @@ class CommonTestCases: self.assertNotEqual(len(toks), len(toks0)) # toks0 should be longer self.assertListEqual(toks, toks2) + # Check that none of the special tokens are lowercased + sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B" + tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens) + + for special_token in tokenizer.all_special_tokens: + print(special_token, special_token in tokenized_sequence) + assert special_token in tokenized_sequence + tokenizer = self.get_tokenizer(do_lower_case=False) added = tokenizer.add_tokens(new_toks) From 65c75fc58796b278d58b0ce2c8d2031594ef0f64 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 20 Dec 2019 11:34:16 -0500 Subject: [PATCH 268/302] Clean special tokens test --- transformers/tests/tokenization_tests_commons.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py index cdec24d9f0..c417d033dc 100644 --- a/transformers/tests/tokenization_tests_commons.py +++ b/transformers/tests/tokenization_tests_commons.py @@ -138,8 +138,7 @@ class CommonTestCases: tokenized_sequence = tokenizer.tokenize(sequence_with_special_tokens) for special_token in tokenizer.all_special_tokens: - print(special_token, special_token in 
tokenized_sequence) - assert special_token in tokenized_sequence + self.assertTrue(special_token in tokenized_sequence) tokenizer = self.get_tokenizer(do_lower_case=False) From 1c12ee0e55f4aec6d90c789c7b45e9adac0ed259 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 18:28:27 +0100 Subject: [PATCH 269/302] fixing xlm-roberta tokenizer max_length and automodels --- transformers/modeling_auto.py | 17 ++++++++++++++--- transformers/modeling_utils.py | 2 +- transformers/pipelines.py | 5 +++-- transformers/tokenization_utils.py | 6 +++++- transformers/tokenization_xlm_roberta.py | 16 ++++++++++------ 5 files changed, 33 insertions(+), 13 deletions(-) diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py index 761b2ce324..6b49efd378 100644 --- a/transformers/modeling_auto.py +++ b/transformers/modeling_auto.py @@ -20,7 +20,7 @@ import logging from .configuration_auto import (AlbertConfig, BertConfig, CamembertConfig, CTRLConfig, DistilBertConfig, GPT2Config, OpenAIGPTConfig, RobertaConfig, - TransfoXLConfig, XLMConfig, XLNetConfig) + TransfoXLConfig, XLMConfig, XLNetConfig, XLMRobertaConfig) from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, \ BertForTokenClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP @@ -41,7 +41,8 @@ from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertF from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, \ AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP -from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForMultipleChoice, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP +from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, \ + XLMRobertaForMultipleChoice, XLMRobertaForTokenClassification, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_utils import PreTrainedModel, SequenceSummary @@ -146,6 +147,8 @@ class AutoModel(object): return AlbertModel(config) elif isinstance(config, CamembertConfig): return CamembertModel(config) + elif isinstance(config, XLMRobertaConfig): + return XLMRobertaModel(config) raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod @@ -333,6 +336,8 @@ class AutoModelWithLMHead(object): return XLMWithLMHeadModel(config) elif isinstance(config, CTRLConfig): return CTRLLMHeadModel(config) + elif isinstance(config, XLMRobertaConfig): + return XLMRobertaForMaskedLM(config) raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod @@ -509,6 +514,8 @@ class AutoModelForSequenceClassification(object): return XLNetForSequenceClassification(config) elif isinstance(config, XLMConfig): return XLMForSequenceClassification(config) + elif isinstance(config, XLMRobertaConfig): + return XLMRobertaForSequenceClassification(config) raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod @@ -787,6 +794,8 @@ class AutoModelForTokenClassification: return XLNetForTokenClassification(config) elif isinstance(config, RobertaConfig): return RobertaForTokenClassification(config) + elif isinstance(config, XLMRobertaConfig): + return XLMRobertaForTokenClassification(config) raise ValueError("Unrecognized configuration class {}".format(config)) @classmethod @@ -865,6 +874,8 @@ class AutoModelForTokenClassification: 
return CamembertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'distilbert' in pretrained_model_name_or_path: return DistilBertForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + elif 'xlm-roberta' in pretrained_model_name_or_path: + return XLMRobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'roberta' in pretrained_model_name_or_path: return RobertaForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) elif 'bert' in pretrained_model_name_or_path: @@ -873,4 +884,4 @@ class AutoModelForTokenClassification: return XLNetForTokenClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) raise ValueError("Unrecognized model identifier in {}. Should contains one of " - "'bert', 'xlnet', 'camembert', 'distilbert', 'roberta'".format(pretrained_model_name_or_path)) + "'bert', 'xlnet', 'camembert', 'distilbert', 'xlm-roberta', 'roberta'".format(pretrained_model_name_or_path)) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index d899771603..e3f4b9f3b8 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -415,7 +415,7 @@ class PreTrainedModel(nn.Module): state_dict = torch.load(resolved_archive_file, map_location='cpu') except: raise OSError("Unable to load weights from pytorch checkpoint file. " - "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ") + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True. ") missing_keys = [] unexpected_keys = [] diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 33e1ab4022..c3756109af 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -49,7 +49,7 @@ logger = logging.getLogger(__name__) def get_framework(model=None): """ Select framework (TensorFlow/PyTorch) to use. - If both frameworks are installed and no specific model is provided, defaults to using TensorFlow. + If both frameworks are installed and no specific model is provided, defaults to using PyTorch. """ if is_tf_available() and is_torch_available() and model is not None and not isinstance(model, str): # Both framework are available but the use supplied a model class instance. @@ -60,7 +60,8 @@ def get_framework(model=None): "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " "To install PyTorch, read the instructions at https://pytorch.org/.") else: - framework = 'tf' if is_tf_available() else 'pt' + # framework = 'tf' if is_tf_available() else 'pt' + framework = 'pt' if is_torch_available() else 'tf' return framework class ArgumentHandler(ABC): diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index 2635a38db0..d77a7100ab 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -434,7 +434,11 @@ class PreTrainedTokenizer(object): init_kwargs[key] = value # Instantiate tokenizer. - tokenizer = cls(*init_inputs, **init_kwargs) + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except OSError: + OSError("Unable to load vocabulary from file. 
" + "Please check that the provided vocabulary is accessible and not corrupted.") # Save inputs and kwargs for saving and re-loading with ``save_pretrained`` tokenizer.init_inputs = init_inputs diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index 4397e7b031..57a42dde5c 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -40,8 +40,12 @@ PRETRAINED_VOCAB_FILES_MAP = { } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - 'xlm-roberta-base': None, - 'xlm-roberta-large': None, + 'xlm-roberta-base': 512, + 'xlm-roberta-large': 512, + 'xlm-roberta-large-finetuned-conll02-dutch': 512, + 'xlm-roberta-large-finetuned-conll02-spanish': 512, + 'xlm-roberta-large-finetuned-conll03-english': 512, + 'xlm-roberta-large-finetuned-conll03-german': 512, } class XLMRobertaTokenizer(PreTrainedTokenizer): @@ -58,10 +62,10 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): def __init__(self, vocab_file, bos_token="", eos_token="", sep_token="", cls_token="", unk_token="", pad_token='', mask_token='', **kwargs): - super(XLMRobertaTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, - sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, - mask_token=mask_token, - **kwargs) + super(XLMRobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, + sep_token=sep_token, cls_token=cls_token, pad_token=pad_token, + mask_token=mask_token, + **kwargs) self.max_len_single_sentence = self.max_len - 2 # take into account special tokens self.max_len_sentences_pair = self.max_len - 4 # take into account special tokens self.sp_model = spm.SentencePieceProcessor() From bbaaec046c493c9bb3bb2b24f1352413c647e3ff Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 19:19:20 +0100 Subject: [PATCH 270/302] fixing CLI pipeline --- transformers/commands/run.py | 32 +++++++++++------ transformers/pipelines.py | 67 +++++++++++++++++++----------------- 2 files changed, 58 insertions(+), 41 deletions(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index 50c42d3a40..c2c141734b 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -9,6 +9,9 @@ logger = logging.getLogger(__name__) # pylint: disable=invalid-name def try_infer_format_from_ext(path: str): + if not path: + return 'pipe' + for ext in PipelineDataFormat.SUPPORTED_FORMATS: if path.endswith(ext): return ext @@ -20,9 +23,16 @@ def try_infer_format_from_ext(path: str): def run_command_factory(args): - nlp = pipeline(task=args.task, model=args.model, config=args.config, tokenizer=args.tokenizer, device=args.device) + nlp = pipeline(task=args.task, + model=args.model if args.model else None, + config=args.config, + tokenizer=args.tokenizer, + device=args.device) format = try_infer_format_from_ext(args.input) if args.format == 'infer' else args.format - reader = PipelineDataFormat.from_str(format, args.output, args.input, args.column) + reader = PipelineDataFormat.from_str(format=format, + output_path=args.output, + input_path=args.input, + column=args.column if args.column else nlp.default_input_names) return RunCommand(nlp, reader) @@ -35,24 +45,26 @@ class RunCommand(BaseTransformersCLICommand): @staticmethod def register_subcommand(parser: ArgumentParser): run_parser = parser.add_parser('run', help="Run a pipeline through the CLI") - run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 
indicates CPU, >= 0 indicates GPU (default: -1)') run_parser.add_argument('--task', choices=SUPPORTED_TASKS.keys(), help='Task to run') - run_parser.add_argument('--model', type=str, required=True, help='Name or path to the model to instantiate.') + run_parser.add_argument('--input', type=str, help='Path to the file to use for inference') + run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.') + run_parser.add_argument('--model', type=str, help='Name or path to the model to instantiate.') run_parser.add_argument('--config', type=str, help='Name or path to the model\'s config to instantiate.') run_parser.add_argument('--tokenizer', type=str, help='Name of the tokenizer to use. (default: same as the model name)') run_parser.add_argument('--column', type=str, help='Name of the column to use as input. (For multi columns input as QA use column1,columns2)') run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from') - run_parser.add_argument('--input', type=str, help='Path to the file to use for inference') - run_parser.add_argument('--output', type=str, help='Path to the file that will be used post to write results.') + run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') run_parser.set_defaults(func=run_command_factory) def run(self): - nlp, output = self._nlp, [] + nlp, outputs = self._nlp, [] + for entry in self._reader: - if self._reader.is_multi_columns: - output += nlp(**entry) + output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) + if isinstance(output, dict): + outputs.append(output) else: - output += nlp(entry) + outputs += output # Saving data if self._nlp.binary_output: diff --git a/transformers/pipelines.py b/transformers/pipelines.py index c3756109af..01491cf2be 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -14,12 +14,14 @@ # limitations under the License. from __future__ import absolute_import, division, print_function, unicode_literals +import sys import csv import json import os import pickle import logging import six + from abc import ABC, abstractmethod from contextlib import contextmanager from itertools import groupby @@ -98,28 +100,29 @@ class PipelineDataFormat: Supported data formats currently includes: - JSON - CSV + - stdin/stdout (pipe) PipelineDataFormat also includes some utilities to work with multi-columns like mapping from datasets columns to pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format. 
""" SUPPORTED_FORMATS = ['json', 'csv', 'pipe'] - def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]): - self.output = output - self.path = input - self.column = column.split(',') if column else [''] + def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]): + self.output_path = output_path + self.input_path = input_path + self.column = column.split(',') if column is not None else [''] self.is_multi_columns = len(self.column) > 1 if self.is_multi_columns: self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column] - if output is not None: - if exists(abspath(self.output)): - raise OSError('{} already exists on disk'.format(self.output)) + if output_path is not None: + if exists(abspath(self.output_path)): + raise OSError('{} already exists on disk'.format(self.output_path)) - if input is not None: - if not exists(abspath(self.path)): - raise OSError('{} doesnt exist on disk'.format(self.path)) + if input_path is not None: + if not exists(abspath(self.input_path)): + raise OSError('{} doesnt exist on disk'.format(self.input_path)) @abstractmethod def __iter__(self): @@ -140,7 +143,7 @@ class PipelineDataFormat: :param data: data to store :return: (str) Path where the data has been saved """ - path, _ = os.path.splitext(self.output) + path, _ = os.path.splitext(self.output_path) binary_path = os.path.extsep.join((path, 'pickle')) with open(binary_path, 'wb+') as f_output: @@ -149,23 +152,23 @@ class PipelineDataFormat: return binary_path @staticmethod - def from_str(name: str, output: Optional[str], path: Optional[str], column: Optional[str]): - if name == 'json': - return JsonPipelineDataFormat(output, path, column) - elif name == 'csv': - return CsvPipelineDataFormat(output, path, column) - elif name == 'pipe': - return PipedPipelineDataFormat(output, path, column) + def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str]): + if format == 'json': + return JsonPipelineDataFormat(output_path, input_path, column) + elif format == 'csv': + return CsvPipelineDataFormat(output_path, input_path, column) + elif format == 'pipe': + return PipedPipelineDataFormat(output_path, input_path, column) else: - raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(name)) + raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format)) class CsvPipelineDataFormat(PipelineDataFormat): - def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]): - super().__init__(output, input, column) + def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]): + super().__init__(output_path, input_path, column) def __iter__(self): - with open(self.path, 'r') as f: + with open(self.input_path, 'r') as f: reader = csv.DictReader(f) for row in reader: if self.is_multi_columns: @@ -174,7 +177,7 @@ class CsvPipelineDataFormat(PipelineDataFormat): yield row[self.column[0]] def save(self, data: List[dict]): - with open(self.output, 'w') as f: + with open(self.output_path, 'w') as f: if len(data) > 0: writer = csv.DictWriter(f, list(data[0].keys())) writer.writeheader() @@ -182,10 +185,10 @@ class CsvPipelineDataFormat(PipelineDataFormat): class JsonPipelineDataFormat(PipelineDataFormat): - def __init__(self, output: Optional[str], input: Optional[str], column: Optional[str]): - super().__init__(output, input, column) + def __init__(self, output_path: Optional[str], 
input_path: Optional[str], column: Optional[str]): + super().__init__(output_path, input_path, column) - with open(input, 'r') as f: + with open(input_path, 'r') as f: self._entries = json.load(f) def __iter__(self): @@ -196,7 +199,7 @@ class JsonPipelineDataFormat(PipelineDataFormat): yield entry[self.column[0]] def save(self, data: dict): - with open(self.output, 'w') as f: + with open(self.output_path, 'w') as f: json.dump(data, f) @@ -208,9 +211,7 @@ class PipedPipelineDataFormat(PipelineDataFormat): If columns are provided, then the output will be a dictionary with {column_x: value_x} """ def __iter__(self): - import sys for line in sys.stdin: - # Split for multi-columns if '\t' in line: @@ -229,7 +230,7 @@ class PipedPipelineDataFormat(PipelineDataFormat): print(data) def save_binary(self, data: Union[dict, List[dict]]) -> str: - if self.output is None: + if self.output_path is None: raise KeyError( 'When using piped input on pipeline outputting large object requires an output file path. ' 'Please provide such output path through --output argument.' @@ -294,6 +295,9 @@ class Pipeline(_ScikitCompat): nlp = NerPipeline(model='...', config='...', tokenizer='...') nlp = QuestionAnsweringPipeline(model=AutoModel.from_pretrained('...'), tokenizer='...') """ + + default_input_names = None + def __init__(self, model, tokenizer: PreTrainedTokenizer = None, modelcard: ModelCard = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, @@ -582,6 +586,8 @@ class QuestionAnsweringPipeline(Pipeline): Question Answering pipeline using ModelForQuestionAnswering head. """ + default_input_names = 'question,context' + def __init__(self, model, tokenizer: Optional[PreTrainedTokenizer], modelcard: Optional[ModelCard], @@ -684,7 +690,6 @@ class QuestionAnsweringPipeline(Pipeline): } for s, e, score in zip(starts, ends, scores) ] - if len(answers) == 1: return answers[0] return answers From 79e4a6a25c9d9e267a7d7cdb54a97443fd4fc9c4 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 19:33:12 +0100 Subject: [PATCH 271/302] update serving API --- transformers/commands/serving.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/transformers/commands/serving.py b/transformers/commands/serving.py index 3c3f852809..4f41f797d1 100644 --- a/transformers/commands/serving.py +++ b/transformers/commands/serving.py @@ -24,7 +24,11 @@ def serve_command_factory(args: Namespace): Factory function used to instantiate serving server from provided command line arguments. 
:return: ServeCommand """ - nlp = pipeline(task=args.task, model=args.model, config=args.config, tokenizer=args.tokenizer, device=args.device) + nlp = pipeline(task=args.task, + model=args.model if args.model else None, + config=args.config, + tokenizer=args.tokenizer, + device=args.device) return ServeCommand(nlp, args.host, args.port) @@ -68,12 +72,12 @@ class ServeCommand(BaseTransformersCLICommand): """ serve_parser = parser.add_parser('serve', help='CLI tool to run inference requests through REST and GraphQL endpoints.') serve_parser.add_argument('--task', type=str, choices=SUPPORTED_TASKS.keys(), help='The task to run the pipeline on') - serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') serve_parser.add_argument('--host', type=str, default='localhost', help='Interface the server will listen on.') serve_parser.add_argument('--port', type=int, default=8888, help='Port the serving will listen to.') - serve_parser.add_argument('--model', type=str, required=True, help='Model\'s name or path to stored model.') + serve_parser.add_argument('--model', type=str, help='Model\'s name or path to stored model.') serve_parser.add_argument('--config', type=str, help='Model\'s config name or path to stored model.') serve_parser.add_argument('--tokenizer', type=str, help='Tokenizer name to use.') + serve_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') serve_parser.set_defaults(func=serve_command_factory) def __init__(self, pipeline: Pipeline, host: str, port: int): From 71883b6ddcd14929217a0ddf4ad627468b9ab5a8 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 19:40:23 +0100 Subject: [PATCH 272/302] update link in readme --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 1312fcc0ac..769b0499cb 100644 --- a/README.md +++ b/README.md @@ -55,6 +55,7 @@ Choose the right framework for every part of a model's lifetime | [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities | | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 | | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch | +| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrapper around tokenizer and models to use finetuned models | | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation | | [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community | | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers | @@ -496,6 +497,7 @@ New in version `v2.3`: `Pipeline` are high-level objects which automatically han and outputting the result in a structured object. You can create `Pipeline` objects for the following down-stream tasks: + - `feature-extraction`: Generates a tensor representation for the input sequence - `ner`: Generates named entity mapping for each word in the input sequence. - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence. 
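The CLI patches above route dataset columns into pipeline keyword arguments through the `kwarg=column` syntax of `PipelineDataFormat` (falling back to a pipeline's `default_input_names`, e.g. `question,context` for question answering). Below is a minimal, self-contained sketch of that column mapping; the CSV headers `question_text` and `passage` are illustrative examples, not library defaults.

```python
# Sketch of the multi-column mapping performed by PipelineDataFormat (see the diff above).
# Assumption: a CSV row with hypothetical headers 'question_text' and 'passage'.
column_arg = 'question=question_text,context=passage'

# Split "kwarg=column" pairs; a bare name maps a column onto a kwarg of the same name.
columns = column_arg.split(',')
columns = [tuple(c.split('=')) if '=' in c else (c, c) for c in columns]

csv_row = {
    'question_text': 'Where are the fine-tuned weights stored?',
    'passage': 'Fine-tuned weights are uploaded to the model hub with the CLI.',
}

# This is what the run command passes as nlp(**entry) when is_multi_columns is True.
pipeline_kwargs = {kwarg: csv_row[column] for kwarg, column in columns}
print(pipeline_kwargs)
# {'question': 'Where are the fine-tuned weights stored?',
#  'context': 'Fine-tuned weights are uploaded to the model hub with the CLI.'}
```

With a single bare column name (e.g. `--column sequences`), the same parsing yields the raw value and the run command calls `nlp(entry)` instead of `nlp(**entry)`.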
From ceae85ad60da38cacb14eca49f752669a4fe31dc Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 19:52:24 +0100 Subject: [PATCH 273/302] fix mc loading --- transformers/pipelines.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 01491cf2be..7188526a62 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -891,6 +891,10 @@ def pipeline(task: str, model: Optional = None, if isinstance(config, str): config = AutoConfig.from_pretrained(config) + # Instantiate modelcard if needed + if isinstance(modelcard, str): + modelcard = ModelCard.from_pretrained(modelcard) + # Instantiate model if needed if isinstance(model, str): # Handle transparent TF/PT model conversion From e37ca8e11a3aa91e27ed659e6d4e01b208aa83ca Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 20:43:42 +0100 Subject: [PATCH 274/302] fix camembert and XLM-R tokenizer --- transformers/tokenization_camembert.py | 6 ++++++ transformers/tokenization_xlm_roberta.py | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/transformers/tokenization_camembert.py b/transformers/tokenization_camembert.py index b4091558e1..4c4615eb3d 100644 --- a/transformers/tokenization_camembert.py +++ b/transformers/tokenization_camembert.py @@ -22,6 +22,7 @@ from shutil import copyfile import sentencepiece as spm from transformers.tokenization_utils import PreTrainedTokenizer +from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) @@ -145,6 +146,11 @@ class CamembertTokenizer(PreTrainedTokenizer): return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + return out_string + def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py index 57a42dde5c..adbc8cd6c7 100644 --- a/transformers/tokenization_xlm_roberta.py +++ b/transformers/tokenization_xlm_roberta.py @@ -22,6 +22,7 @@ from shutil import copyfile import sentencepiece as spm from transformers.tokenization_utils import PreTrainedTokenizer +from .tokenization_xlnet import SPIECE_UNDERLINE logger = logging.getLogger(__name__) @@ -161,6 +162,11 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): return self.fairseq_ids_to_tokens[index] return self.sp_model.IdToPiece(index - self.fairseq_offset) + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip() + return out_string + def save_vocabulary(self, save_directory): """ Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory. 
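The tokenizer fix above gives the CamemBERT and XLM-RoBERTa tokenizers a `convert_tokens_to_string` method that joins sentencepiece pieces and restores spaces at the `SPIECE_UNDERLINE` marker. A minimal sketch of that behaviour follows; the pieces are hypothetical and not the actual output of either vocabulary.

```python
# Sketch of the detokenization added in tokenization_camembert.py / tokenization_xlm_roberta.py.
# SPIECE_UNDERLINE is the U+2581 marker sentencepiece uses to flag word boundaries.
SPIECE_UNDERLINE = u'\u2581'

def convert_tokens_to_string(tokens):
    """Join sub-word pieces into a single string, turning piece boundaries back into spaces."""
    return ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()

# Hypothetical pieces: 'transformers' split into two sub-word pieces.
pieces = [u'\u2581Hello', u'\u2581world', u'\u2581trans', 'formers']
print(convert_tokens_to_string(pieces))
# Hello world transformers
```

This override is what turns raw sentencepiece pieces back into readable text when these tokenizers decode model output.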
From a241011057245211975b4730170815536527d79d Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 20:43:48 +0100 Subject: [PATCH 275/302] fix pipeline NER --- transformers/pipelines.py | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 7188526a62..f7900feaf3 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -463,7 +463,7 @@ class NerPipeline(Pipeline): def __init__(self, model, tokenizer: PreTrainedTokenizer = None, modelcard: ModelCard = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, - binary_output: bool = False): + binary_output: bool = False, ignore_labels=['O']): super().__init__(model=model, tokenizer=tokenizer, modelcard=modelcard, @@ -473,17 +473,12 @@ class NerPipeline(Pipeline): binary_output=binary_output) self._basic_tokenizer = BasicTokenizer(do_lower_case=False) + self.ignore_labels = ignore_labels def __call__(self, *texts, **kwargs): inputs, answers = self._args_parser(*texts, **kwargs), [] for sentence in inputs: - # Ugly token to word idx mapping (for now) - token_to_word, words = [], self._basic_tokenizer.tokenize(sentence) - for i, w in enumerate(words): - tokens = self.tokenizer.tokenize(w) - token_to_word += [i] * len(tokens) - # Manage correct placement of the tensors with self.device_placement(): @@ -500,26 +495,22 @@ class NerPipeline(Pipeline): with torch.no_grad(): entities = self.model(**tokens)[0][0].cpu().numpy() - # Normalize scores - answer, token_start = [], 1 - for idx, word in groupby(token_to_word): + score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) + labels_idx = score.argmax(axis=-1) - # Sum log prob over token, then normalize across labels - score = np.exp(entities[token_start]) / np.exp(entities[token_start]).sum(-1, keepdims=True) - label_idx = score.argmax() - - if label_idx > 0: + answer = [] + for idx, label_idx in enumerate(labels_idx): + if self.model.config.id2label[label_idx] not in self.ignore_labels: answer += [{ - 'word': words[idx], - 'score': score[label_idx].item(), + 'word': self.tokenizer.decode(tokens['input_ids'][0][idx].cpu().tolist()), + 'score': score[idx][label_idx].item(), 'entity': self.model.config.id2label[label_idx] }] - # Update token start - token_start += len(list(word)) - # Append answers += [answer] + if len(answers) == 1: + return answers[0] return answers From f79a7dc661b9d7caee867af18a2be009478ab739 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 20:57:45 +0100 Subject: [PATCH 276/302] fix NER pipeline --- transformers/pipelines.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index f7900feaf3..86fd25c164 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -491,9 +491,11 @@ class NerPipeline(Pipeline): # Forward if self.framework == 'tf': entities = self.model(tokens)[0][0].numpy() + input_ids = tokens['input_ids'].numpy()[0] else: with torch.no_grad(): entities = self.model(**tokens)[0][0].cpu().numpy() + input_ids = tokens['input_ids'].cpu().numpy()[0] score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) labels_idx = score.argmax(axis=-1) @@ -502,7 +504,7 @@ class NerPipeline(Pipeline): for idx, label_idx in enumerate(labels_idx): if self.model.config.id2label[label_idx] not in self.ignore_labels: answer += [{ - 'word': 
self.tokenizer.decode(tokens['input_ids'][0][idx].cpu().tolist()), + 'word': self.tokenizer.decode(int(input_ids[idx])), 'score': score[idx][label_idx].item(), 'entity': self.model.config.id2label[label_idx] }] From cb6d54bfdabfb1fe566a2c303fcb9f18505d9b10 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 20 Dec 2019 15:06:28 -0500 Subject: [PATCH 277/302] Numpy compatibility for sentence piece convert to int earlier --- transformers/tokenization_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py index d77a7100ab..eda89f22fc 100644 --- a/transformers/tokenization_utils.py +++ b/transformers/tokenization_utils.py @@ -1227,6 +1227,7 @@ class PreTrainedTokenizer(object): return self._convert_id_to_token(ids) tokens = [] for index in ids: + index = int(index) if skip_special_tokens and index in self.all_special_ids: continue if index in self.added_tokens_decoder: From 4775ec354b20196b53894ffeb9af7622a39dd4fc Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 21:47:15 +0100 Subject: [PATCH 278/302] add overwrite - fix ner decoding --- transformers/commands/run.py | 9 ++++++--- transformers/pipelines.py | 25 ++++++++++++++----------- 2 files changed, 20 insertions(+), 14 deletions(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index c2c141734b..2098d03413 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -32,7 +32,8 @@ def run_command_factory(args): reader = PipelineDataFormat.from_str(format=format, output_path=args.output, input_path=args.input, - column=args.column if args.column else nlp.default_input_names) + column=args.column if args.column else nlp.default_input_names, + overwrite=args.overwrite) return RunCommand(nlp, reader) @@ -54,6 +55,7 @@ class RunCommand(BaseTransformersCLICommand): run_parser.add_argument('--column', type=str, help='Name of the column to use as input. 
(For multi columns input as QA use column1,columns2)') run_parser.add_argument('--format', type=str, default='infer', choices=PipelineDataFormat.SUPPORTED_FORMATS, help='Input format to read from') run_parser.add_argument('--device', type=int, default=-1, help='Indicate the device to run onto, -1 indicates CPU, >= 0 indicates GPU (default: -1)') + run_parser.add_argument('--overwrite', action='store_true', help='Allow overwriting the output file.') run_parser.set_defaults(func=run_command_factory) def run(self): @@ -61,6 +63,7 @@ class RunCommand(BaseTransformersCLICommand): for entry in self._reader: output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) + print(output) if isinstance(output, dict): outputs.append(output) else: @@ -68,10 +71,10 @@ class RunCommand(BaseTransformersCLICommand): # Saving data if self._nlp.binary_output: - binary_path = self._reader.save_binary(output) + binary_path = self._reader.save_binary(outputs) logger.warning('Current pipeline requires output to be in binary format, saving at {}'.format(binary_path)) else: - self._reader.save(output) + self._reader.save(outputs) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 86fd25c164..876bdd0c09 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -107,7 +107,7 @@ class PipelineDataFormat: """ SUPPORTED_FORMATS = ['json', 'csv', 'pipe'] - def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]): + def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): self.output_path = output_path self.input_path = input_path self.column = column.split(',') if column is not None else [''] @@ -116,7 +116,7 @@ class PipelineDataFormat: if self.is_multi_columns: self.column = [tuple(c.split('=')) if '=' in c else (c, c) for c in self.column] - if output_path is not None: + if output_path is not None and not overwrite: if exists(abspath(self.output_path)): raise OSError('{} already exists on disk'.format(self.output_path)) @@ -152,25 +152,26 @@ class PipelineDataFormat: return binary_path @staticmethod - def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str]): + def from_str(format: str, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): if format == 'json': - return JsonPipelineDataFormat(output_path, input_path, column) + return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) elif format == 'csv': - return CsvPipelineDataFormat(output_path, input_path, column) + return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) elif format == 'pipe': - return PipedPipelineDataFormat(output_path, input_path, column) + return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite) else: raise KeyError('Unknown reader {} (Available reader are json/csv/pipe)'.format(format)) class CsvPipelineDataFormat(PipelineDataFormat): - def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]): - super().__init__(output_path, input_path, column) + def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): + super().__init__(output_path, input_path, column, overwrite=overwrite) def __iter__(self): with open(self.input_path, 'r') as f: reader = csv.DictReader(f) for row in reader: + print(row, self.column) if self.is_multi_columns: yield {k: 
row[c] for k, c in self.column} else: @@ -185,8 +186,8 @@ class CsvPipelineDataFormat(PipelineDataFormat): class JsonPipelineDataFormat(PipelineDataFormat): - def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str]): - super().__init__(output_path, input_path, column) + def __init__(self, output_path: Optional[str], input_path: Optional[str], column: Optional[str], overwrite=False): + super().__init__(output_path, input_path, column, overwrite=overwrite) with open(input_path, 'r') as f: self._entries = json.load(f) @@ -460,6 +461,8 @@ class NerPipeline(Pipeline): Named Entity Recognition pipeline using ModelForTokenClassification head. """ + default_input_names = 'sequences' + def __init__(self, model, tokenizer: PreTrainedTokenizer = None, modelcard: ModelCard = None, framework: Optional[str] = None, args_parser: ArgumentHandler = None, device: int = -1, @@ -504,7 +507,7 @@ class NerPipeline(Pipeline): for idx, label_idx in enumerate(labels_idx): if self.model.config.id2label[label_idx] not in self.ignore_labels: answer += [{ - 'word': self.tokenizer.decode(int(input_ids[idx])), + 'word': self.tokenizer.decode([int(input_ids[idx])]), 'score': score[idx][label_idx].item(), 'entity': self.model.config.id2label[label_idx] }] From e5812462fc0f81e9808ad87a818cd8af26405722 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 21:51:48 +0100 Subject: [PATCH 279/302] clean up debug and less verbose tqdm --- transformers/file_utils.py | 3 ++- transformers/pipelines.py | 1 - 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 59938868ea..032a6af63b 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -287,7 +287,8 @@ def http_get(url, temp_file, proxies=None, resume_size=0, user_agent=None): return content_length = response.headers.get('Content-Length') total = resume_size + int(content_length) if content_length is not None else None - progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading") + progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, + desc="Downloading", disable=bool(logger.level<=logging.INFO)) for chunk in response.iter_content(chunk_size=1024): if chunk: # filter out keep-alive new chunks progress.update(len(chunk)) diff --git a/transformers/pipelines.py b/transformers/pipelines.py index 876bdd0c09..f4bf3da685 100755 --- a/transformers/pipelines.py +++ b/transformers/pipelines.py @@ -171,7 +171,6 @@ class CsvPipelineDataFormat(PipelineDataFormat): with open(self.input_path, 'r') as f: reader = csv.DictReader(f) for row in reader: - print(row, self.column) if self.is_multi_columns: yield {k: row[c] for k, c in self.column} else: From 655fd068534b2a66b85f0bd05002e27f212ab6a0 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Fri, 20 Dec 2019 21:57:49 +0100 Subject: [PATCH 280/302] clean up --- transformers/commands/run.py | 1 - 1 file changed, 1 deletion(-) diff --git a/transformers/commands/run.py b/transformers/commands/run.py index 2098d03413..df03cee9d7 100644 --- a/transformers/commands/run.py +++ b/transformers/commands/run.py @@ -63,7 +63,6 @@ class RunCommand(BaseTransformersCLICommand): for entry in self._reader: output = nlp(**entry) if self._reader.is_multi_columns else nlp(entry) - print(output) if isinstance(output, dict): outputs.append(output) else: From a436574bfde4f75f518a107f45f987579d813ce5 Mon Sep 17 00:00:00 2001 From: Lysandre Date: Fri, 20 Dec 2019 16:22:20 -0500 
Subject: [PATCH 281/302] Release: v2.3.0 --- README.md | 2 +- docs/source/conf.py | 2 +- setup.py | 2 +- transformers/__init__.py | 2 +- try.py | 0 5 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 try.py diff --git a/README.md b/README.md index 769b0499cb..416adcc1ef 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ Choose the right framework for every part of a model's lifetime | [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community | | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers | | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers | -| [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more | +| [Documentation][(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more | ## Installation diff --git a/docs/source/conf.py b/docs/source/conf.py index 99b7b44922..41a65eec29 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -26,7 +26,7 @@ author = u'huggingface' # The short X.Y version version = u'' # The full version, including alpha/beta/rc tags -release = u'2.2.2' +release = u'2.3.0' # -- General configuration --------------------------------------------------- diff --git a/setup.py b/setup.py index 4bfb774155..cd64a6ce90 100644 --- a/setup.py +++ b/setup.py @@ -46,7 +46,7 @@ extras['all'] = [package for package in extras.values()] setup( name="transformers", - version="2.2.2", + version="2.3.0", author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors", author_email="thomas@huggingface.co", description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch", diff --git a/transformers/__init__.py b/transformers/__init__.py index 1622c3892d..c0c0901df4 100755 --- a/transformers/__init__.py +++ b/transformers/__init__.py @@ -1,4 +1,4 @@ -__version__ = "2.2.2" +__version__ = "2.3.0" # Work around to update TensorFlow's absl.logging threshold which alters the # default Python logging output behavior when present. diff --git a/try.py b/try.py new file mode 100644 index 0000000000..e69de29bb2 From 3df1d2d144dae698b4aed085aec57a2033d608b7 Mon Sep 17 00:00:00 2001 From: Francesco Date: Tue, 17 Dec 2019 10:19:54 +0100 Subject: [PATCH 282/302] - Create the output directory (whose name is passed by the user in the "save_directory" parameter) where it will be saved encoder and decoder, if not exists. 
- Empty the output directory, if it contains any files or subdirectories. - Create the "encoder" directory inside "save_directory", if not exists. - Create the "decoder" directory inside "save_directory", if not exists. - Save the encoder and the decoder in the previous two directories, respectively. --- transformers/modeling_encoder_decoder.py | 31 ++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index a91c046d8f..8ae6daa690 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -166,6 +166,37 @@ class PreTrainedEncoderDecoder(nn.Module): We save the encoder' and decoder's parameters in two separate directories. """ + + # If the root output directory does not exist, create it + if not os.path.exists(save_directory): + os.mkdir(save_directory) + + # Check whether the output directory is empty or not + sub_directories = [directory for directory in os.listdir(save_directory) + if os.path.isdir(os.path.join(save_directory, directory))] + + if len(sub_directories) > 0: + if "encoder" in sub_directories and "decoder" in sub_directories: + print("WARNING: there is an older version of encoder-decoder saved in" +\ + " the output directory. The default behaviour is to overwrite them.") + + # Empty the output directory + for directory_to_remove in sub_directories: + # Remove all files into the subdirectory + files_to_remove = os.listdir(os.path.join(save_directory, directory_to_remove)) + for file_to_remove in files_to_remove: + os.remove(os.path.join(save_directory, directory_to_remove, file_to_remove)) + # Remove the subdirectory itself + os.rmdir(os.path.join(save_directory, directory_to_remove)) + + assert(len(os.listdir(save_directory)) == 0) # sanity check + + if not os.path.exists(os.path.join(save_directory, "encoder")): + os.mkdir(os.path.join(save_directory, "encoder")) + + if not os.path.exists(os.path.join(save_directory, "decoder")): + os.mkdir(os.path.join(save_directory, "decoder")) + self.encoder.save_pretrained(os.path.join(save_directory, "encoder")) self.decoder.save_pretrained(os.path.join(save_directory, "decoder")) From a80778f40e4738071b5d01420a0328bb00cdb356 Mon Sep 17 00:00:00 2001 From: Francesco Date: Wed, 18 Dec 2019 16:05:28 +0100 Subject: [PATCH 283/302] small refactoring (only esthetic, not functional) --- transformers/modeling_encoder_decoder.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py index 8ae6daa690..c327bb9199 100644 --- a/transformers/modeling_encoder_decoder.py +++ b/transformers/modeling_encoder_decoder.py @@ -191,13 +191,14 @@ class PreTrainedEncoderDecoder(nn.Module): assert(len(os.listdir(save_directory)) == 0) # sanity check + # Create the "encoder" directory inside the output directory and save the encoder into it if not os.path.exists(os.path.join(save_directory, "encoder")): os.mkdir(os.path.join(save_directory, "encoder")) + self.encoder.save_pretrained(os.path.join(save_directory, "encoder")) + # Create the "encoder" directory inside the output directory and save the decoder into it if not os.path.exists(os.path.join(save_directory, "decoder")): os.mkdir(os.path.join(save_directory, "decoder")) - - self.encoder.save_pretrained(os.path.join(save_directory, "encoder")) self.decoder.save_pretrained(os.path.join(save_directory, "decoder")) def forward(self, encoder_input_ids, 
decoder_input_ids, **kwargs): From 228f52867c92e21c6e7223eb2d6c7d9904b230e2 Mon Sep 17 00:00:00 2001 From: Dom Hudson Date: Thu, 7 Nov 2019 19:58:17 +0000 Subject: [PATCH 284/302] Bug fix: 1764 --- transformers/modeling_roberta.py | 45 +++++++++++++++------ transformers/tests/modeling_roberta_test.py | 41 +++++++++++++++++++ 2 files changed, 74 insertions(+), 12 deletions(-) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index fc27353d37..cf74c1e7b5 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -51,24 +51,45 @@ class RobertaEmbeddings(BertEmbeddings): padding_idx=self.padding_idx) def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): - if input_ids is not None: - input_shape = input_ids.size() - else: - input_shape = inputs_embeds.size()[:-1] - - seq_length = input_shape[1] - device = input_ids.device if input_ids is not None else inputs_embeds.device - if position_ids is None: - # Position numbers begin at padding_idx+1. Padding symbols are ignored. - # cf. fairseq's `utils.make_positions` - position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=device) - position_ids = position_ids.unsqueeze(0).expand(input_shape) + + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids(input_ids).to(input_ids.device) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids, inputs_embeds=inputs_embeds) + def create_position_ids_from_input_ids(self, x): + """ Replace non-padding symbols with their position numbers. Position numbers begin at + padding_idx+1. Padding symbols are ignored. This is modified from fairseq's + `utils.make_positions`. + + :param torch.Tensor x: + :return torch.Tensor: + """ + mask = x.ne(self.padding_idx).long() + incremental_indicies = torch.cumsum(mask, dim=1) * mask + return incremental_indicies + self.padding_idx + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ We are provided embeddings directly. We cannot infer which are padded so just generate + sequential position ids. 
+ + :param torch.Tensor inputs_embeds: + :return torch.Tensor: + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long, + device=inputs_embeds.device) + return position_ids.unsqueeze(0) + ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_ diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 4d34a50528..121cb84148 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -25,6 +25,7 @@ if is_torch_available(): import torch from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForTokenClassification) + from transformers.modeling_roberta import RobertaEmbeddings from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP from .modeling_common_test import (CommonTestCases, ids_tensor) @@ -205,6 +206,46 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): shutil.rmtree(cache_dir) self.assertIsNotNone(model) + def test_create_position_ids_respects_padding_index(self): + """ Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. Therefore, the + first available non-padding position index is RobertaEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = RobertaEmbeddings(config=config) + + input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]]) + expected_positions = torch.as_tensor([[ + 0 + model.padding_idx + 1, + 1 + model.padding_idx + 1, + 2 + model.padding_idx + 1, + model.padding_idx + ]]) + + position_ids = model.create_position_ids_from_input_ids(input_ids) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + + def test_create_position_ids_from_inputs_embeds(self): + """ Ensure that the default position ids only assign a sequential . This is a regression + test for https://github.com/huggingface/transformers/issues/1761 + + The position ids should be masked with the embedding object's padding index. 
Therefore, the + first available non-padding position index is RobertaEmbeddings.padding_idx + 1 + """ + config = self.model_tester.prepare_config_and_inputs()[0] + model = RobertaEmbeddings(config=config) + + input_ids = torch.Tensor(1, 4, 30) + expected_positions = torch.as_tensor([[ + 0 + model.padding_idx + 1, + 1 + model.padding_idx + 1, + 2 + model.padding_idx + 1, + 3 + model.padding_idx + 1, + ]]) + position_ids = model.create_position_ids_from_inputs_embeds(input_ids) + self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) class RobertaModelIntegrationTest(unittest.TestCase): From 3e52915fa7106a739aa6f9feda9937961ce25068 Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Fri, 20 Dec 2019 19:01:27 -0500 Subject: [PATCH 285/302] [RoBERTa] Embeddings: fix dimensionality bug --- transformers/modeling_roberta.py | 3 +- transformers/tests/modeling_roberta_test.py | 31 ++++++++++++++------- 2 files changed, 22 insertions(+), 12 deletions(-) diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py index cf74c1e7b5..b188799522 100644 --- a/transformers/modeling_roberta.py +++ b/transformers/modeling_roberta.py @@ -52,7 +52,6 @@ class RobertaEmbeddings(BertEmbeddings): def forward(self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None): if position_ids is None: - if input_ids is not None: # Create the position ids from the input token ids. Any padded tokens remain padded. position_ids = self.create_position_ids_from_input_ids(input_ids).to(input_ids.device) @@ -88,7 +87,7 @@ class RobertaEmbeddings(BertEmbeddings): position_ids = torch.arange(self.padding_idx+1, sequence_length+self.padding_idx+1, dtype=torch.long, device=inputs_embeds.device) - return position_ids.unsqueeze(0) + return position_ids.unsqueeze(0).expand(input_shape) ROBERTA_START_DOCSTRING = r""" The RoBERTa model was proposed in diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 121cb84148..fe6ffe98c6 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -225,6 +225,10 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): ]]) position_ids = model.create_position_ids_from_input_ids(input_ids) + self.assertEqual( + position_ids.shape, + expected_positions.shape + ) self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) def test_create_position_ids_from_inputs_embeds(self): @@ -235,17 +239,24 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): first available non-padding position index is RobertaEmbeddings.padding_idx + 1 """ config = self.model_tester.prepare_config_and_inputs()[0] - model = RobertaEmbeddings(config=config) + embeddings = RobertaEmbeddings(config=config) - input_ids = torch.Tensor(1, 4, 30) - expected_positions = torch.as_tensor([[ - 0 + model.padding_idx + 1, - 1 + model.padding_idx + 1, - 2 + model.padding_idx + 1, - 3 + model.padding_idx + 1, - ]]) - position_ids = model.create_position_ids_from_inputs_embeds(input_ids) - self.assertTrue(torch.all(torch.eq(position_ids, expected_positions))) + inputs_embeds = torch.Tensor(2, 4, 30) + expected_single_positions = [ + 0 + embeddings.padding_idx + 1, + 1 + embeddings.padding_idx + 1, + 2 + embeddings.padding_idx + 1, + 3 + embeddings.padding_idx + 1, + ] + expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions]) + position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds) + self.assertEqual( 
+ position_ids.shape, + expected_positions.shape + ) + self.assertTrue( + torch.all(torch.eq(position_ids, expected_positions)) + ) class RobertaModelIntegrationTest(unittest.TestCase): From ac1b449cc938bb34bc9021feff599cfd3b2376ae Mon Sep 17 00:00:00 2001 From: Julien Chaumond Date: Sat, 21 Dec 2019 00:09:01 -0500 Subject: [PATCH 286/302] [doc] move distilroberta to more appropriate place cc @lysandrejik --- docs/source/pretrained_models.rst | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst index a359990f5a..eb7b41ffc9 100644 --- a/docs/source/pretrained_models.rst +++ b/docs/source/pretrained_models.rst @@ -3,6 +3,7 @@ Pretrained models Here is the full list of the currently provided pretrained models together with a short presentation of each model. +For a list that includes community-uploaded models, refer to `https://huggingface.co/models `__. +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | Architecture | Shortcut name | Details of the model | @@ -154,6 +155,10 @@ Here is the full list of the currently provided pretrained models together with | | | | ``roberta-large`` fine-tuned on `MNLI `__. | | | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ +| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | +| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. | +| | | (see `details `__) | +| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``roberta-base-openai-detector`` | | 12-layer, 768-hidden, 12-heads, 125M parameters | | | | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model. | | | | (see `details `__) | @@ -174,10 +179,6 @@ Here is the full list of the currently provided pretrained models together with | | | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint. | | | | (see `details `__) | | +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ -| | ``distilroberta-base`` | | 6-layer, 768-hidden, 12-heads, 82M parameters | -| | | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint. | -| | | (see `details `__) | -| +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+ | | ``distilbert-base-german-cased`` | | 6-layer, 768-hidden, 12-heads, 66M parameters | | | | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint. | | | | (see `details `__) | From 12726f8556152dbc6c115327646ebb33ccb2bc4f Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:58 +0100 Subject: [PATCH 287/302] Remove redundant torch.jit.trace in tests. This looks like it could be expensive, so don't run it twice. 
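For context on this patch: the test previously called torch.jit.trace twice, once to check that tracing works and once more to obtain a module to save. The fix keeps a single traced handle and reuses it for the save/load round trip. A minimal sketch of the resulting flow with a toy module (the module name, file name, and tensor shapes below are illustrative, not taken from the test suite):

    import torch

    class TinyModel(torch.nn.Module):
        def forward(self, x):
            return x * 2

    model = TinyModel()
    example_inputs = torch.ones(1, 3)

    # Trace once; tracing is the potentially expensive step.
    traced = torch.jit.trace(model, example_inputs)

    # Reuse the same traced module for both saving and loading.
    torch.jit.save(traced, "traced_model.pt")
    loaded = torch.jit.load("traced_model.pt")
    assert torch.equal(loaded(example_inputs), model(example_inputs))
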
--- transformers/tests/modeling_common_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index c84162117a..c03d307e71 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -218,12 +218,11 @@ class CommonTestCases: inputs = inputs_dict['input_ids'] # Let's keep only input_ids try: - torch.jit.trace(model, inputs) + traced_gpt2 = torch.jit.trace(model, inputs) except RuntimeError: self.fail("Couldn't trace module.") try: - traced_gpt2 = torch.jit.trace(model, inputs) torch.jit.save(traced_gpt2, "traced_model.pt") except RuntimeError: self.fail("Couldn't save module.") From 478e456e8392be1356a795a354215ba7dbf03a7b Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:58 +0100 Subject: [PATCH 288/302] Use a random temp dir for writing file in tests. --- transformers/tests/modeling_common_test.py | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index c03d307e71..8bf66c3582 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -18,7 +18,7 @@ from __future__ import print_function import copy import sys -import os +import os.path import shutil import tempfile import json @@ -222,16 +222,18 @@ class CommonTestCases: except RuntimeError: self.fail("Couldn't trace module.") - try: - torch.jit.save(traced_gpt2, "traced_model.pt") - except RuntimeError: - self.fail("Couldn't save module.") + with TemporaryDirectory() as tmp_dir_name: + pt_file_name = os.path.join(tmp_dir_name, "traced_model.pt") - try: - loaded_model = torch.jit.load("traced_model.pt") - os.remove("traced_model.pt") - except ValueError: - self.fail("Couldn't load module.") + try: + torch.jit.save(traced_gpt2, pt_file_name) + except Exception: + self.fail("Couldn't save module.") + + try: + loaded_model = torch.jit.load(pt_file_name) + except Exception: + self.fail("Couldn't load module.") model.to(torch_device) model.eval() From 286d5bb6b7afb1fcca1923d431f42c716f53a290 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:58 +0100 Subject: [PATCH 289/302] Use a random temp dir for writing pruned models in tests. 
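The key idea in this patch is to stage the download inside the cache directory itself, so that the final step is a same-filesystem os.rename instead of a copy out of /tmp. A simplified Python 3 sketch of that flow, standing in for the library's get_from_cache/http_get helpers (the function name and arguments here are illustrative, and it assumes requests is available):

    import os
    import tempfile

    import requests

    def fetch_to_cache(url, cache_dir, filename):
        os.makedirs(cache_dir, exist_ok=True)
        cache_path = os.path.join(cache_dir, filename)

        # Stage the file next to its final location so the rename below is a
        # cheap same-filesystem move, never a cross-device copy.
        with tempfile.NamedTemporaryFile(dir=cache_dir, delete=False) as temp_file:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    temp_file.write(chunk)
            temp_file.flush()

        os.rename(temp_file.name, cache_path)
        return cache_path
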
--- transformers/tests/modeling_common_test.py | 24 ++++++++-------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index 8bf66c3582..cf36332207 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -353,12 +353,11 @@ class CommonTestCases: heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)), -1: [0]} model.prune_heads(heads_to_prune) - directory = "pruned_model" - if not os.path.exists(directory): - os.makedirs(directory) - model.save_pretrained(directory) - model = model_class.from_pretrained(directory) - model.to(torch_device) + + with TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) with torch.no_grad(): outputs = model(**inputs_dict) @@ -367,7 +366,6 @@ class CommonTestCases: self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[-1].shape[-3], self.model_tester.num_attention_heads - 1) - shutil.rmtree(directory) def test_head_pruning_save_load_from_config_init(self): if not self.test_pruning: @@ -427,14 +425,10 @@ class CommonTestCases: self.assertEqual(attentions[2].shape[-3], self.model_tester.num_attention_heads) self.assertEqual(attentions[3].shape[-3], self.model_tester.num_attention_heads) - directory = "pruned_model" - - if not os.path.exists(directory): - os.makedirs(directory) - model.save_pretrained(directory) - model = model_class.from_pretrained(directory) - model.to(torch_device) - shutil.rmtree(directory) + with TemporaryDirectory() as temp_dir_name: + model.save_pretrained(temp_dir_name) + model = model_class.from_pretrained(temp_dir_name) + model.to(torch_device) with torch.no_grad(): outputs = model(**inputs_dict) From b67fa1a8d2302d808ecb9d95355181eaf21ee3b6 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:58 +0100 Subject: [PATCH 290/302] Download models directly to cache_dir. This allows moving the file instead of copying it, which is more reliable. Also it avoids writing large amounts of data to /tmp, which may not be large enough to accomodate it. Refs #2222. 
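The diff that follows swaps the per-test "/tmp/transformers_test/" directory, which was removed with shutil.rmtree after every test, for a single shared CACHE_DIR that is never deleted, so a checkpoint downloaded by one slow test is reused by later test cases and later runs. Roughly, the new pattern looks like this (a sketch assuming the transformers package and network access; "bert-base-uncased" is one of the checkpoints the tests already use):

    import os
    import tempfile

    from transformers import BertModel

    # Rooted in the system temp dir, so TMPDIR/TEMP/TMP can relocate it.
    CACHE_DIR = os.path.join(tempfile.gettempdir(), "transformers_test")

    # The first call downloads into CACHE_DIR; subsequent calls (and subsequent
    # test runs) hit the cache instead of re-downloading.
    model = BertModel.from_pretrained("bert-base-uncased", cache_dir=CACHE_DIR)
    assert model is not None
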
--- transformers/file_utils.py | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 032a6af63b..61ff1d00bc 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -10,10 +10,9 @@ import json import logging import os import six -import shutil import tempfile import fnmatch -from functools import wraps +from functools import partial, wraps from hashlib import sha256 from io import open @@ -345,14 +344,13 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag def _resumable_file_manager(): with open(incomplete_path,'a+b') as f: yield f - os.remove(incomplete_path) temp_file_manager = _resumable_file_manager if os.path.exists(incomplete_path): resume_size = os.stat(incomplete_path).st_size else: resume_size = 0 else: - temp_file_manager = tempfile.NamedTemporaryFile + temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) resume_size = 0 if etag is not None and (not os.path.exists(cache_path) or force_download): @@ -371,12 +369,9 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag # we are copying the file before closing it, so flush to avoid truncation temp_file.flush() - # shutil.copyfileobj() starts at the current position, so go to the start - temp_file.seek(0) - logger.info("copying %s to cache at %s", temp_file.name, cache_path) - with open(cache_path, 'wb') as cache_file: - shutil.copyfileobj(temp_file, cache_file) + logger.info("storing %s in cache at %s", url, cache_path) + os.rename(temp_file.name, cache_path) logger.info("creating metadata file for %s", cache_path) meta = {'url': url, 'etag': etag} @@ -387,6 +382,4 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag output_string = unicode(output_string, 'utf-8') # The beauty of python 2 meta_file.write(output_string) - logger.info("removing temp file %s", temp_file.name) - return cache_path From b670c2668426326aeffe626aabac7ee2dff3c7c2 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:58 +0100 Subject: [PATCH 291/302] Take advantage of the cache when running tests. Caching models across test cases and across runs of the test suite makes slow tests somewhat more bearable. Use gettempdir() instead of /tmp in tests. This makes it easier to change the location of the cache with semi-standard TMPDIR/TEMP/TMP environment variables. Fix #2222. 
--- .../tests/modeling_tf_xxx_test.py | 7 ++----- .../tests/modeling_xxx_test.py | 7 ++----- transformers/tests/modeling_albert_test.py | 7 ++----- transformers/tests/modeling_bert_test.py | 7 ++----- transformers/tests/modeling_common_test.py | 6 ++---- transformers/tests/modeling_ctrl_test.py | 7 ++----- transformers/tests/modeling_distilbert_test.py | 6 ++---- transformers/tests/modeling_gpt2_test.py | 7 ++----- transformers/tests/modeling_openai_test.py | 7 ++----- transformers/tests/modeling_roberta_test.py | 7 ++----- transformers/tests/modeling_t5_test.py | 7 ++----- transformers/tests/modeling_tf_albert_test.py | 8 ++------ transformers/tests/modeling_tf_auto_test.py | 18 +++++++++--------- transformers/tests/modeling_tf_bert_test.py | 7 ++----- transformers/tests/modeling_tf_ctrl_test.py | 7 ++----- .../tests/modeling_tf_distilbert_test.py | 6 ++---- transformers/tests/modeling_tf_gpt2_test.py | 7 ++----- .../tests/modeling_tf_openai_gpt_test.py | 7 ++----- transformers/tests/modeling_tf_roberta_test.py | 7 ++----- transformers/tests/modeling_tf_t5_test.py | 7 ++----- .../tests/modeling_tf_transfo_xl_test.py | 7 ++----- transformers/tests/modeling_tf_xlm_test.py | 7 ++----- transformers/tests/modeling_tf_xlnet_test.py | 7 ++----- transformers/tests/modeling_transfo_xl_test.py | 7 ++----- transformers/tests/modeling_xlm_test.py | 7 ++----- transformers/tests/modeling_xlnet_test.py | 7 ++----- transformers/tests/utils.py | 3 +++ 27 files changed, 62 insertions(+), 132 deletions(-) diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py index 912a4aa340..6eba932a8e 100644 --- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py @@ -17,12 +17,11 @@ from __future__ import division from __future__ import print_function import unittest -import shutil import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import XxxConfig, is_tf_available @@ -245,10 +244,8 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in ['xxx-base-uncased']: - model = TFXxxModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFXxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py index 30e614b3f2..5e22392d00 100644 --- a/templates/adding_a_new_model/tests/modeling_xxx_test.py +++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py @@ -17,13 +17,12 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): from transformers import (XxxConfig, XxxModel, XxxForMaskedLM, @@ -249,10 +248,8 @@ class XxxModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = 
"/tmp/transformers_test/" for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = XxxModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = XxxModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py index 1911d244e7..b726fd9278 100644 --- a/transformers/tests/modeling_albert_test.py +++ b/transformers/tests/modeling_albert_test.py @@ -17,13 +17,12 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): from transformers import (AlbertConfig, AlbertModel, AlbertForMaskedLM, @@ -230,10 +229,8 @@ class AlbertModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = AlbertModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = AlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py index 0eb7bc9a14..a5adff8f68 100644 --- a/transformers/tests/modeling_bert_test.py +++ b/transformers/tests/modeling_bert_test.py @@ -17,13 +17,12 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): from transformers import (BertConfig, BertModel, BertForMaskedLM, @@ -360,10 +359,8 @@ class BertModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = BertModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = BertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py index cf36332207..2116651f4a 100644 --- a/transformers/tests/modeling_common_test.py +++ b/transformers/tests/modeling_common_test.py @@ -30,7 +30,7 @@ import logging from transformers import is_torch_available -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): import torch @@ -753,10 +753,8 @@ class CommonTestCases: [[], []]) def create_and_check_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(self.base_model_class.pretrained_model_archive_map.keys())[:1]: - model = self.base_model_class.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = self.base_model_class.from_pretrained(model_name, cache_dir=CACHE_DIR) 
self.parent.assertIsNotNone(model) def prepare_config_and_inputs_for_common(self): diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py index c7de49b2ab..ed0d62d1e6 100644 --- a/transformers/tests/modeling_ctrl_test.py +++ b/transformers/tests/modeling_ctrl_test.py @@ -16,7 +16,6 @@ from __future__ import division from __future__ import print_function import unittest -import shutil import pdb from transformers import is_torch_available @@ -27,7 +26,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -205,10 +204,8 @@ class CTRLModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = CTRLModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = CTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py index 82f71c40da..ac6f5d248e 100644 --- a/transformers/tests/modeling_distilbert_test.py +++ b/transformers/tests/modeling_distilbert_test.py @@ -27,7 +27,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -235,10 +235,8 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester): # @slow # def test_model_from_pretrained(self): - # cache_dir = "/tmp/transformers_test/" # for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - # model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir) - # shutil.rmtree(cache_dir) + # model = DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) # self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py index a82e39c261..ad2ec1fd91 100644 --- a/transformers/tests/modeling_gpt2_test.py +++ b/transformers/tests/modeling_gpt2_test.py @@ -17,7 +17,6 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available @@ -27,7 +26,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -239,10 +238,8 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = GPT2Model.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = GPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py index 7655e432e8..1880febcae 100644 --- a/transformers/tests/modeling_openai_test.py +++ 
b/transformers/tests/modeling_openai_test.py @@ -17,7 +17,6 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available @@ -27,7 +26,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -207,10 +206,8 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = OpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py index 4d34a50528..299cbd01ad 100644 --- a/transformers/tests/modeling_roberta_test.py +++ b/transformers/tests/modeling_roberta_test.py @@ -17,7 +17,6 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available @@ -29,7 +28,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -199,10 +198,8 @@ class RobertaModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = RobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py index c337163375..9fd9a4b304 100644 --- a/transformers/tests/modeling_t5_test.py +++ b/transformers/tests/modeling_t5_test.py @@ -17,13 +17,12 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device if is_torch_available(): from transformers import (T5Config, T5Model, T5WithLMHeadModel) @@ -175,10 +174,8 @@ class T5ModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = T5Model.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = T5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index 93aeab66c2..ee71371a18 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -17,12 +17,11 @@ from __future__ import division from 
__future__ import print_function import unittest -import shutil import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import AlbertConfig, is_tf_available @@ -217,12 +216,9 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: for model_name in ['albert-base-uncased']: - model = TFAlbertModel.from_pretrained( - model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFAlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py index 7ab6eaa3d6..2ad39ddccf 100644 --- a/transformers/tests/modeling_tf_auto_test.py +++ b/transformers/tests/modeling_tf_auto_test.py @@ -46,11 +46,11 @@ class TFAutoModelTest(unittest.TestCase): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: for model_name in ['bert-base-uncased']: - config = AutoConfig.from_pretrained(model_name, force_download=True) + config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) - model = TFAutoModel.from_pretrained(model_name, force_download=True) + model = TFAutoModel.from_pretrained(model_name) self.assertIsNotNone(model) self.assertIsInstance(model, TFBertModel) @@ -59,11 +59,11 @@ class TFAutoModelTest(unittest.TestCase): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: for model_name in ['bert-base-uncased']: - config = AutoConfig.from_pretrained(model_name, force_download=True) + config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) - model = TFAutoModelWithLMHead.from_pretrained(model_name, force_download=True) + model = TFAutoModelWithLMHead.from_pretrained(model_name) self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForMaskedLM) @@ -72,11 +72,11 @@ class TFAutoModelTest(unittest.TestCase): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: for model_name in ['bert-base-uncased']: - config = AutoConfig.from_pretrained(model_name, force_download=True) + config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) - model = TFAutoModelForSequenceClassification.from_pretrained(model_name, force_download=True) + model = TFAutoModelForSequenceClassification.from_pretrained(model_name) self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForSequenceClassification) @@ -85,17 +85,17 @@ class TFAutoModelTest(unittest.TestCase): logging.basicConfig(level=logging.INFO) # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: for model_name in ['bert-base-uncased']: - config = AutoConfig.from_pretrained(model_name, force_download=True) + config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) - model = TFAutoModelForQuestionAnswering.from_pretrained(model_name, force_download=True) + model = TFAutoModelForQuestionAnswering.from_pretrained(model_name) 
self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForQuestionAnswering) def test_from_pretrained_identifier(self): logging.basicConfig(level=logging.INFO) - model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True) + model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER) self.assertIsInstance(model, TFBertForMaskedLM) diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py index 20073e1ab8..abf20b1514 100644 --- a/transformers/tests/modeling_tf_bert_test.py +++ b/transformers/tests/modeling_tf_bert_test.py @@ -17,12 +17,11 @@ from __future__ import division from __future__ import print_function import unittest -import shutil import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import BertConfig, is_tf_available @@ -310,11 +309,9 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" # for model_name in list(TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: for model_name in ['bert-base-uncased']: - model = TFBertModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py index 0876582e57..93b231e517 100644 --- a/transformers/tests/modeling_tf_ctrl_test.py +++ b/transformers/tests/modeling_tf_ctrl_test.py @@ -17,12 +17,11 @@ from __future__ import division from __future__ import print_function import unittest -import shutil import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import CTRLConfig, is_tf_available @@ -189,10 +188,8 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = TFCTRLModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFCTRLModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py index d9e971c2a5..f28b5c397b 100644 --- a/transformers/tests/modeling_tf_distilbert_test.py +++ b/transformers/tests/modeling_tf_distilbert_test.py @@ -20,7 +20,7 @@ import unittest from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import DistilBertConfig, is_tf_available @@ -211,10 +211,8 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester): # @slow # def test_model_from_pretrained(self): - # cache_dir = "/tmp/transformers_test/" # for model_name in list(DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - # model = DistilBertModel.from_pretrained(model_name, cache_dir=cache_dir) - # shutil.rmtree(cache_dir) + # model = 
DistilBertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) # self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py index 3f30b32787..90920342ba 100644 --- a/transformers/tests/modeling_tf_gpt2_test.py +++ b/transformers/tests/modeling_tf_gpt2_test.py @@ -17,12 +17,11 @@ from __future__ import division from __future__ import print_function import unittest -import shutil import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import GPT2Config, is_tf_available @@ -220,10 +219,8 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = TFGPT2Model.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFGPT2Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py index 863dbf1bc0..065bf2acde 100644 --- a/transformers/tests/modeling_tf_openai_gpt_test.py +++ b/transformers/tests/modeling_tf_openai_gpt_test.py @@ -17,12 +17,11 @@ from __future__ import division from __future__ import print_function import unittest -import shutil import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import OpenAIGPTConfig, is_tf_available @@ -219,10 +218,8 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFOpenAIGPTModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py index f4ed97c44b..93c478ae28 100644 --- a/transformers/tests/modeling_tf_roberta_test.py +++ b/transformers/tests/modeling_tf_roberta_test.py @@ -17,11 +17,10 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import RobertaConfig, is_tf_available @@ -192,10 +191,8 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = TFRobertaModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFRobertaModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py index 
b905a9875b..da9ce6f89d 100644 --- a/transformers/tests/modeling_tf_t5_test.py +++ b/transformers/tests/modeling_tf_t5_test.py @@ -17,12 +17,11 @@ from __future__ import division from __future__ import print_function import unittest -import shutil import sys from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import T5Config, is_tf_available @@ -162,10 +161,8 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in ['t5-small']: - model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFT5Model.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) if __name__ == "__main__": diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py index 746a6a1321..8225c09275 100644 --- a/transformers/tests/modeling_tf_transfo_xl_test.py +++ b/transformers/tests/modeling_tf_transfo_xl_test.py @@ -18,11 +18,10 @@ from __future__ import print_function import unittest import random -import shutil from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow from transformers import TransfoXLConfig, is_tf_available @@ -205,10 +204,8 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFTransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py index 228e436149..8b5ab6d742 100644 --- a/transformers/tests/modeling_tf_xlm_test.py +++ b/transformers/tests/modeling_tf_xlm_test.py @@ -17,7 +17,6 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_tf_available @@ -31,7 +30,7 @@ if is_tf_available(): from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow @require_tf @@ -252,10 +251,8 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = XLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py index eb66d92793..15fd917481 100644 --- a/transformers/tests/modeling_tf_xlnet_test.py +++ b/transformers/tests/modeling_tf_xlnet_test.py @@ -20,7 +20,6 @@ import os import unittest import json import random -import shutil from transformers import XLNetConfig, is_tf_available @@ -35,7 +34,7 @@ if is_tf_available(): from .modeling_tf_common_test 
import (TFCommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_tf, slow +from .utils import CACHE_DIR, require_tf, slow @require_tf @@ -319,10 +318,8 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = TFXLNetModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TFXLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py index f41d50a3a0..acbe95fe4a 100644 --- a/transformers/tests/modeling_transfo_xl_test.py +++ b/transformers/tests/modeling_transfo_xl_test.py @@ -18,7 +18,6 @@ from __future__ import print_function import unittest import random -import shutil from transformers import is_torch_available @@ -29,7 +28,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -208,10 +207,8 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = TransfoXLModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = TransfoXLModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py index 7cae6c848e..fcc2f4699b 100644 --- a/transformers/tests/modeling_xlm_test.py +++ b/transformers/tests/modeling_xlm_test.py @@ -17,7 +17,6 @@ from __future__ import division from __future__ import print_function import unittest -import shutil from transformers import is_torch_available @@ -28,7 +27,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -318,10 +317,8 @@ class XLMModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = XLMModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = XLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py index 6d901ee699..6d218d6ef4 100644 --- a/transformers/tests/modeling_xlnet_test.py +++ b/transformers/tests/modeling_xlnet_test.py @@ -20,7 +20,6 @@ import os import unittest import json import random -import shutil from transformers import is_torch_available @@ -33,7 +32,7 @@ if is_torch_available(): from .modeling_common_test import (CommonTestCases, ids_tensor) from .configuration_common_test import ConfigTester -from .utils import require_torch, slow, torch_device +from .utils import CACHE_DIR, require_torch, slow, torch_device @require_torch @@ -385,10 +384,8 @@ class 
XLNetModelTest(CommonTestCases.CommonModelTester): @slow def test_model_from_pretrained(self): - cache_dir = "/tmp/transformers_test/" for model_name in list(XLNET_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = XLNetModel.from_pretrained(model_name, cache_dir=cache_dir) - shutil.rmtree(cache_dir) + model = XLNetModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py index c950ad8f17..ba0e19f420 100644 --- a/transformers/tests/utils.py +++ b/transformers/tests/utils.py @@ -1,11 +1,14 @@ import os import unittest +import tempfile from distutils.util import strtobool from transformers.file_utils import _tf_available, _torch_available +CACHE_DIR = os.path.join(tempfile.gettempdir(), "transformers_test") + SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" From a4c9338b83ba612b5f5aec645f375d048d9a7647 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:59 +0100 Subject: [PATCH 292/302] Prevent parallel downloads of the same file with a lock. Since the file is written to the filesystem, a filesystem lock is the way to go here. Add a dependency on the third-party filelock library to get cross-platform functionality. --- setup.py | 1 + transformers/file_utils.py | 89 +++++++++++++++++++++----------------- 2 files changed, 50 insertions(+), 40 deletions(-) diff --git a/setup.py b/setup.py index cd64a6ce90..fe2e1526bf 100644 --- a/setup.py +++ b/setup.py @@ -59,6 +59,7 @@ setup( "tests.*", "tests"]), install_requires=['numpy', 'boto3', + 'filelock', 'requests', 'tqdm', 'regex != 2019.12.17', diff --git a/transformers/file_utils.py b/transformers/file_utils.py index 61ff1d00bc..ec925c6160 100644 --- a/transformers/file_utils.py +++ b/transformers/file_utils.py @@ -24,6 +24,8 @@ from tqdm.auto import tqdm from contextlib import contextmanager from . import __version__ +from filelock import FileLock + logger = logging.getLogger(__name__) # pylint: disable=invalid-name try: @@ -333,53 +335,60 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag # If we don't have a connection (etag is None) and can't identify the file # try to get the last downloaded one if not os.path.exists(cache_path) and etag is None: - matching_files = fnmatch.filter(os.listdir(cache_dir), filename + '.*') - matching_files = list(filter(lambda s: not s.endswith('.json'), matching_files)) + matching_files = [ + file + for file in fnmatch.filter(os.listdir(cache_dir), filename + '.*') + if not file.endswith('.json') and not file.endswith('.lock') + ] if matching_files: cache_path = os.path.join(cache_dir, matching_files[-1]) - if resume_download: - incomplete_path = cache_path + '.incomplete' - @contextmanager - def _resumable_file_manager(): - with open(incomplete_path,'a+b') as f: - yield f - temp_file_manager = _resumable_file_manager - if os.path.exists(incomplete_path): - resume_size = os.stat(incomplete_path).st_size - else: - resume_size = 0 - else: - temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) - resume_size = 0 + # Prevent parallel downloads of the same file with a lock. + lock_path = cache_path + '.lock' + with FileLock(lock_path): - if etag is not None and (not os.path.exists(cache_path) or force_download): - # Download to temporary file, then copy to cache dir once finished. - # Otherwise you get corrupt cache entries if the download gets interrupted. 
- with temp_file_manager() as temp_file: - logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) - - # GET file object - if url.startswith("s3://"): - if resume_download: - logger.warn('Warning: resumable downloads are not implemented for "s3://" urls') - s3_get(url, temp_file, proxies=proxies) + if resume_download: + incomplete_path = cache_path + '.incomplete' + @contextmanager + def _resumable_file_manager(): + with open(incomplete_path,'a+b') as f: + yield f + temp_file_manager = _resumable_file_manager + if os.path.exists(incomplete_path): + resume_size = os.stat(incomplete_path).st_size else: - http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) + resume_size = 0 + else: + temp_file_manager = partial(tempfile.NamedTemporaryFile, dir=cache_dir, delete=False) + resume_size = 0 - # we are copying the file before closing it, so flush to avoid truncation - temp_file.flush() + if etag is not None and (not os.path.exists(cache_path) or force_download): + # Download to temporary file, then copy to cache dir once finished. + # Otherwise you get corrupt cache entries if the download gets interrupted. + with temp_file_manager() as temp_file: + logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name) - logger.info("storing %s in cache at %s", url, cache_path) - os.rename(temp_file.name, cache_path) + # GET file object + if url.startswith("s3://"): + if resume_download: + logger.warn('Warning: resumable downloads are not implemented for "s3://" urls') + s3_get(url, temp_file, proxies=proxies) + else: + http_get(url, temp_file, proxies=proxies, resume_size=resume_size, user_agent=user_agent) - logger.info("creating metadata file for %s", cache_path) - meta = {'url': url, 'etag': etag} - meta_path = cache_path + '.json' - with open(meta_path, 'w') as meta_file: - output_string = json.dumps(meta) - if sys.version_info[0] == 2 and isinstance(output_string, str): - output_string = unicode(output_string, 'utf-8') # The beauty of python 2 - meta_file.write(output_string) + # we are copying the file before closing it, so flush to avoid truncation + temp_file.flush() + + logger.info("storing %s in cache at %s", url, cache_path) + os.rename(temp_file.name, cache_path) + + logger.info("creating metadata file for %s", cache_path) + meta = {'url': url, 'etag': etag} + meta_path = cache_path + '.json' + with open(meta_path, 'w') as meta_file: + output_string = json.dumps(meta) + if sys.version_info[0] == 2 and isinstance(output_string, str): + output_string = unicode(output_string, 'utf-8') # The beauty of python 2 + meta_file.write(output_string) return cache_path From 29cbab98f0b36a3056e2982bf968c8370bad3838 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:59 +0100 Subject: [PATCH 293/302] Parallelize tests on Circle CI. Set the number of CPUs manually based on the Circle CI resource class, or else we're getting 36 CPUs, which is far too much (perhaps that's the underlying hardware and not what Circle CI allocates to us). Don't parallelize the custom tokenizers tests because they take less than one second to run and parallelization actually makes them slower. 
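The same parallel invocation can be reproduced outside CI. A rough local equivalent of the new test step, assuming pytest and pytest-xdist are installed and the repository layout shown in the diff:

    import sys

    import pytest

    # "-n 8" asks pytest-xdist for eight worker processes; the custom tokenizer
    # job stays serial because it already finishes in under a second.
    sys.exit(pytest.main(["-n", "8", "-s", "-v", "./transformers/tests/"]))
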
--- .circleci/config.yml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index b094067eb5..a7496c81e6 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -11,9 +11,9 @@ jobs: - run: sudo pip install torch - run: sudo pip install tensorflow - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install pytest codecov pytest-cov pytest-xdist - run: sudo pip install tensorboardX scikit-learn - - run: python -m pytest -sv ./transformers/tests/ --cov + - run: python -m pytest -n 8 -s -v ./transformers/tests/ --cov - run: codecov build_py3_torch: working_directory: ~/transformers @@ -25,10 +25,10 @@ jobs: - checkout - run: sudo pip install torch - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install pytest codecov pytest-cov pytest-xdist - run: sudo pip install tensorboardX scikit-learn - - run: python -m pytest -sv ./transformers/tests/ --cov - - run: python -m pytest -sv ./examples/ + - run: python -m pytest -n 8 -s -v ./transformers/tests/ --cov + - run: python -m pytest -n 8 -s -v ./examples/ - run: codecov build_py3_tf: working_directory: ~/transformers @@ -40,9 +40,9 @@ jobs: - checkout - run: sudo pip install tensorflow - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest codecov pytest-cov + - run: sudo pip install pytest codecov pytest-cov pytest-xdist - run: sudo pip install tensorboardX scikit-learn - - run: python -m pytest -sv ./transformers/tests/ --cov + - run: python -m pytest -n 8 -s -v ./transformers/tests/ --cov - run: codecov build_py3_custom_tokenizers: working_directory: ~/transformers @@ -51,7 +51,7 @@ jobs: steps: - checkout - run: sudo pip install --progress-bar off . - - run: sudo pip install pytest + - run: sudo pip install pytest pytest-xdist - run: sudo pip install mecab-python3 - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py deploy_doc: From bb3bfa2d293589af0b3141c6f7235beba1c6bb44 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:59 +0100 Subject: [PATCH 294/302] Distribute tests from the same file to the same worker. This should prevent two issues: - hitting API rate limits for tests that hit the HF API - multiplying the cost of expensive test setups --- .circleci/config.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index a7496c81e6..f9de338fa4 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -13,7 +13,7 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov pytest-xdist - run: sudo pip install tensorboardX scikit-learn - - run: python -m pytest -n 8 -s -v ./transformers/tests/ --cov + - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov - run: codecov build_py3_torch: working_directory: ~/transformers @@ -27,8 +27,8 @@ jobs: - run: sudo pip install --progress-bar off . 
- run: sudo pip install pytest codecov pytest-cov pytest-xdist - run: sudo pip install tensorboardX scikit-learn - - run: python -m pytest -n 8 -s -v ./transformers/tests/ --cov - - run: python -m pytest -n 8 -s -v ./examples/ + - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov + - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/ - run: codecov build_py3_tf: working_directory: ~/transformers @@ -42,7 +42,7 @@ jobs: - run: sudo pip install --progress-bar off . - run: sudo pip install pytest codecov pytest-cov pytest-xdist - run: sudo pip install tensorboardX scikit-learn - - run: python -m pytest -n 8 -s -v ./transformers/tests/ --cov + - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov - run: codecov build_py3_custom_tokenizers: working_directory: ~/transformers From 80caf79d0743a354a43d2aac5ccfe58e0ac1b80a Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:59 +0100 Subject: [PATCH 295/302] Prevent excessive parallelism in PyTorch. We're already using as many processes in parallel as we have CPU cores. Furthermore, the number of core may be incorrectly calculated as 36 (we've seen this in pytest-xdist) which make compound the problem. PyTorch performance craters without this. --- .circleci/config.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index f9de338fa4..812817efaa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -4,6 +4,8 @@ jobs: working_directory: ~/transformers docker: - image: circleci/python:3.5 + environment: + OMP_NUM_THREADS: 1 resource_class: xlarge parallelism: 1 steps: @@ -19,6 +21,8 @@ jobs: working_directory: ~/transformers docker: - image: circleci/python:3.5 + environment: + OMP_NUM_THREADS: 1 resource_class: xlarge parallelism: 1 steps: @@ -34,6 +38,8 @@ jobs: working_directory: ~/transformers docker: - image: circleci/python:3.5 + environment: + OMP_NUM_THREADS: 1 resource_class: xlarge parallelism: 1 steps: From 343c094f2156962a24bf19c5fbd771d01c81caf7 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Fri, 20 Dec 2019 20:56:59 +0100 Subject: [PATCH 296/302] Run examples separately from tests. This optimizes the total run time of the Circle CI test suite. 
--- .circleci/config.yml | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 812817efaa..bfa3b943aa 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,6 +1,6 @@ version: 2 jobs: - build_py3_torch_and_tf: + run_tests_py3_torch_and_tf: working_directory: ~/transformers docker: - image: circleci/python:3.5 @@ -17,7 +17,7 @@ jobs: - run: sudo pip install tensorboardX scikit-learn - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov - run: codecov - build_py3_torch: + run_tests_py3_torch: working_directory: ~/transformers docker: - image: circleci/python:3.5 @@ -32,9 +32,8 @@ jobs: - run: sudo pip install pytest codecov pytest-cov pytest-xdist - run: sudo pip install tensorboardX scikit-learn - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov - - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/ - run: codecov - build_py3_tf: + run_tests_py3_tf: working_directory: ~/transformers docker: - image: circleci/python:3.5 @@ -50,7 +49,7 @@ jobs: - run: sudo pip install tensorboardX scikit-learn - run: python -m pytest -n 8 --dist=loadfile -s -v ./transformers/tests/ --cov - run: codecov - build_py3_custom_tokenizers: + run_tests_py3_custom_tokenizers: working_directory: ~/transformers docker: - image: circleci/python:3.5 @@ -60,6 +59,21 @@ jobs: - run: sudo pip install pytest pytest-xdist - run: sudo pip install mecab-python3 - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py + run_examples_py3_torch: + working_directory: ~/transformers + docker: + - image: circleci/python:3.5 + environment: + OMP_NUM_THREADS: 1 + resource_class: xlarge + parallelism: 1 + steps: + - checkout + - run: sudo pip install torch + - run: sudo pip install --progress-bar off . + - run: sudo pip install pytest pytest-xdist + - run: sudo pip install tensorboardX scikit-learn + - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/ deploy_doc: working_directory: ~/transformers docker: @@ -72,7 +86,7 @@ jobs: - run: sudo pip install --progress-bar off -r docs/requirements.txt - run: sudo pip install --progress-bar off -r requirements.txt - run: ./.circleci/deploy.sh - repository_consistency: + check_repository_consistency: working_directory: ~/transformers docker: - image: circleci/python:3.5 @@ -91,9 +105,10 @@ workflows: version: 2 build_and_test: jobs: - - repository_consistency - - build_py3_custom_tokenizers - - build_py3_torch_and_tf - - build_py3_torch - - build_py3_tf + - check_repository_consistency + - run_examples_py3_torch + - run_tests_py3_custom_tokenizers + - run_tests_py3_torch_and_tf + - run_tests_py3_torch + - run_tests_py3_tf - deploy_doc: *workflow_filters From 767bc3ca68d7f1617e2edd31374be3e2f05b27b6 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 08:46:26 +0100 Subject: [PATCH 297/302] Fix typo in model name. This looks like a copy/paste mistake. Probably this test was never run. Refs #2250. 
--- transformers/tests/modeling_tf_xlm_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py index 8b5ab6d742..065d355b45 100644 --- a/transformers/tests/modeling_tf_xlm_test.py +++ b/transformers/tests/modeling_tf_xlm_test.py @@ -252,7 +252,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): for model_name in list(TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - model = XLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR) + model = TFXLMModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) From b8e924e10d283c095e5aca3f762d812d5106b105 Mon Sep 17 00:00:00 2001 From: Aymeric Augustin Date: Sat, 21 Dec 2019 08:50:15 +0100 Subject: [PATCH 298/302] Restore test. This looks like debug code accidentally committed in b18509c2. Refs #2250. --- transformers/tests/modeling_tf_albert_test.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py index ee71371a18..374417cfe2 100644 --- a/transformers/tests/modeling_tf_albert_test.py +++ b/transformers/tests/modeling_tf_albert_test.py @@ -216,8 +216,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester): @slow def test_model_from_pretrained(self): - # for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: - for model_name in ['albert-base-uncased']: + for model_name in list(TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]: model = TFAlbertModel.from_pretrained(model_name, cache_dir=CACHE_DIR) self.assertIsNotNone(model) From 8a2be93b4e9ba15e1bb4141202bf3e17ca7dcdd6 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 21 Dec 2019 13:31:28 +0100 Subject: [PATCH 299/302] fix merge --- examples/run_squad.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/examples/run_squad.py b/examples/run_squad.py index 26983a2ba7..3832b66f23 100644 --- a/examples/run_squad.py +++ b/examples/run_squad.py @@ -374,24 +374,6 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal if args.local_rank == 0 and not evaluate: torch.distributed.barrier() # Make sure only the first process in distributed training process the dataset, and the others will use the cache - # Convert to Tensors and build dataset - all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long) - all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long) - all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long) - all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long) - all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float) - if evaluate: - all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_example_index, all_cls_index, all_p_mask) - else: - all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long) - all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long) - all_is_impossible = torch.tensor([1.0 if f.is_impossible == True else 0.0 for f in features], dtype=torch.float) - dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, - all_start_positions, all_end_positions, - all_cls_index, all_p_mask, all_is_impossible) - if 
output_examples: return dataset, examples, features return dataset From 1c37746892a5fd680e88264346197bb313c8dd08 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 21 Dec 2019 13:52:49 +0100 Subject: [PATCH 300/302] fixing run_generation --- examples/run_generation.py | 9 ++++----- transformers/configuration_utils.py | 1 - transformers/modeling_utils.py | 9 +++++---- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/examples/run_generation.py b/examples/run_generation.py index 8121f4f5aa..67e1da7413 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -156,7 +156,7 @@ def main(): parser.add_argument("--length", type=int, default=20) parser.add_argument("--stop_token", type=str, default=None, help="Token at which text generation is stopped") - parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 0 implies greedy sampling") + parser.add_argument("--temperature", type=float, default=1.0, help="temperature of 1.0 has no effect, lower tend toward greedy sampling") parser.add_argument("--repetition_penalty", type=float, default=1.0, help="primarily useful for CTRL model; in that case, use 1.2") parser.add_argument("--k", type=int, default=0) parser.add_argument("--p", type=float, default=0.9) @@ -187,7 +187,6 @@ def main(): tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) model = model_class.from_pretrained(args.model_name_or_path) model.to(args.device) - model.eval() args.length = adjust_length_to_model( args.length, max_sequence_length=model.config.max_position_embeddings @@ -202,11 +201,11 @@ def main(): if requires_preprocessing: prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) prompt_text, model_kwargs = prepare_input(args, model, tokenizer, prompt_text) - encoded_prompt = torch.tensor(tokenizer.encode(prompt_text, add_special_tokens=False)).unsqueeze(0) + encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt') output_sequences = model.generate( - intput_ids=encoded_prompt, - length=args.length, + input_ids=encoded_prompt, + max_length=args.length, temperature=args.temperature, top_k=args.k, top_p=args.p, diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py index 456af3341c..ceb032a57c 100644 --- a/transformers/configuration_utils.py +++ b/transformers/configuration_utils.py @@ -72,7 +72,6 @@ class PretrainedConfig(object): self.bos_token_id = kwargs.pop('bos_token_id', 0) self.pad_token_id = kwargs.pop('pad_token_id', 0) self.eos_token_ids = kwargs.pop('eos_token_ids', 0) - self.batch_size = kwargs.pop('batch_size', 1) self.length_penalty = kwargs.pop('length_penalty', 1.) self.num_return_sequences = kwargs.pop('num_return_sequences', 1) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index f55c209ac0..5b28d5b755 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -485,9 +485,10 @@ class PreTrainedModel(nn.Module): def prepare_inputs_for_generation(self, input_ids, **kwargs): return {"input_ids": input_ids} + @torch.no_grad() def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, - bos_token_id=None, pad_token_id=None, eos_token_ids=None, batch_size=None, + bos_token_id=None, pad_token_id=None, eos_token_ids=None, length_penalty=None, num_return_sequences=None, **model_kwargs): """ Sequence generator for models with a LM head. 
@@ -530,19 +531,20 @@ class PreTrainedModel(nn.Module): bos_token_id = bos_token_id if bos_token_id is not None else self.config.bos_token_id pad_token_id = pad_token_id if pad_token_id is not None else self.config.pad_token_id eos_token_ids = eos_token_ids if eos_token_ids is not None else self.config.eos_token_ids - batch_size = batch_size if batch_size is not None else self.config.batch_size length_penalty = length_penalty if length_penalty is not None else self.config.length_penalty num_return_sequences = num_return_sequences if num_return_sequences is not None else self.config.num_return_sequences if input_ids is not None: batch_size = input_ids.shape[0] # overriden by the input batch_size + else: + batch_size = 1 if isinstance(eos_token_ids, int): eos_token_ids = [eos_token_ids] assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." - assert temperature > 0, "`temperature` should be strictely positive." + # assert temperature > 0, "`temperature` should be strictely positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." @@ -550,7 +552,6 @@ class PreTrainedModel(nn.Module): assert isinstance(pad_token_id, int) and pad_token_id >= 0, "`pad_token_id` should be a positive integer." assert isinstance(eos_token_ids, (list, tuple)) and (e >= 0 for e in eos_token_ids), \ "`eos_token_ids` should be a positive integer or a list/tuple of positive integers." - assert isinstance(batch_size, int) and batch_size > 0, "`batch_size` should be a strictely positive integer." assert length_penalty > 0, "`length_penalty` should be strictely positive." assert isinstance(num_return_sequences, int) and num_return_sequences > 0, "`num_return_sequences` should be a strictely positive integer." From 300ec3003c282c5e3f06b33509af10dd0336d0ba Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 21 Dec 2019 14:02:19 +0100 Subject: [PATCH 301/302] fixing run_generation example - using torch.no_grad --- examples/run_generation.py | 31 ++++++++++++++----------------- transformers/configuration_xlm.py | 4 ++-- transformers/modeling_utils.py | 29 +++++++++++++---------------- transformers/modeling_xlm.py | 6 +++--- 4 files changed, 32 insertions(+), 38 deletions(-) diff --git a/examples/run_generation.py b/examples/run_generation.py index 67e1da7413..ade85f0269 100644 --- a/examples/run_generation.py +++ b/examples/run_generation.py @@ -87,11 +87,11 @@ def prepare_ctrl_input(args, _, tokenizer, prompt_text): logger.info( "WARNING! 
You are not starting your generation from a control code so you won't get good results" ) - return prompt_text, {} + return prompt_text def prepare_xlm_input(args, model, tokenizer, prompt_text): - kwargs = {"language": None, "mask_token_id": None} + # kwargs = {"language": None, "mask_token_id": None} # Set the language use_lang_emb = hasattr(model.config, "use_lang_emb") and model.config.use_lang_emb @@ -107,14 +107,15 @@ def prepare_xlm_input(args, model, tokenizer, prompt_text): + str(list(available_languages)) + " >>> " ) - kwargs["language"] = tokenizer.lang2id[language] + # kwargs["language"] = tokenizer.lang2id[language] + # TODO fix mask_token_id setup when configurations will be synchronized between models and tokenizers # XLM masked-language modeling (MLM) models need masked token - is_xlm_mlm = "mlm" in args.model_name_or_path - if is_xlm_mlm: - kwargs["mask_token_id"] = tokenizer.mask_token_id + # is_xlm_mlm = "mlm" in args.model_name_or_path + # if is_xlm_mlm: + # kwargs["mask_token_id"] = tokenizer.mask_token_id - return prompt_text, kwargs + return prompt_text def prepare_xlnet_input(args, _, tokenizer, prompt_text): @@ -179,8 +180,8 @@ def main(): try: args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] - except KeyError as ke: - raise ke( + except KeyError: + raise KeyError( "the model {} you specified is not supported. You are welcome to add it and open a PR :)" ) @@ -197,10 +198,9 @@ def main(): # Different models need different input formatting and/or extra arguments requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys() - model_kwargs = {} if requires_preprocessing: prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type) - prompt_text, model_kwargs = prepare_input(args, model, tokenizer, prompt_text) + prompt_text = prepare_input(args, model, tokenizer, prompt_text) encoded_prompt = tokenizer.encode(prompt_text, add_special_tokens=False, return_tensors='pt') output_sequences = model.generate( @@ -210,14 +210,11 @@ def main(): top_k=args.k, top_p=args.p, repetition_penalty=args.repetition_penalty, - **model_kwargs, ) - generated_sequence = output_sequences.tolist()[ - encoded_prompt.size(1) : - ] # adapted to case where num_samples > 1 - text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True) - text = text[: text.find(args.stop_token) if args.stop_token else None] + generated_sequence = output_sequences.tolist() + text = [tokenizer.decode(seq, clean_up_tokenization_spaces=True) for seq in generated_sequence] + # text = text[: text.find(args.stop_token) if args.stop_token else None] print(text) diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py index 1938b85741..1134c7ab61 100644 --- a/transformers/configuration_xlm.py +++ b/transformers/configuration_xlm.py @@ -113,8 +113,8 @@ class XLMConfig(PretrainedConfig): summary_first_dropout=0.1, start_n_top=5, end_n_top=5, - mask_token_id = 0, - lang_id = 0, + mask_token_id=0, + lang_id=0, **kwargs): """Constructs XLMConfig. 
""" diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 5b28d5b755..005252c141 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -489,7 +489,7 @@ class PreTrainedModel(nn.Module): def generate(self, input_ids=None, max_length=None, do_sample=None, num_beams=None, temperature=None, top_k=None, top_p=None, repetition_penalty=None, bos_token_id=None, pad_token_id=None, eos_token_ids=None, - length_penalty=None, num_return_sequences=None, **model_kwargs): + length_penalty=None, num_return_sequences=None): """ Sequence generator for models with a LM head. The method currently supports greedy or penalized greedy decoding, sampling with top-k or nucleus sampling @@ -519,7 +519,8 @@ class PreTrainedModel(nn.Module): # We cannot generate if the model does not have a LM head if self.get_output_embeddings() is None: - raise AttributeError("You tried do generated sequences with a model that does not have a LM Head.") + raise AttributeError("You tried to generate sequences with a model that does not have a LM Head." + "Please use another model class (e.g. `OpenAIGPTLMHeadModel`)") max_length = max_length if max_length is not None else self.config.max_length do_sample = do_sample if do_sample is not None else self.config.do_sample @@ -544,7 +545,7 @@ class PreTrainedModel(nn.Module): assert isinstance(max_length, int) and max_length > 0, "`max_length` should be a strictely positive integer." assert isinstance(do_sample, bool), "`do_sample` should be a boolean." assert isinstance(num_beams, int) and num_beams > 0, "`num_beams` should be a strictely positive integer." - # assert temperature > 0, "`temperature` should be strictely positive." + # assert temperature >= 0, "`temperature` should be positive." assert isinstance(top_k, int) and top_k >= 0, "`top_k` should be a positive integer." assert 0 <= top_p <= 1, "`top_p` should be between 0 and 1." assert repetition_penalty >= 1.0, "`repetition_penalty` should be >= 1." @@ -576,13 +577,11 @@ class PreTrainedModel(nn.Module): output = self._generate_beam_search(input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, pad_token_id, eos_token_ids, effective_batch_size, - length_penalty, num_beams, vocab_size, - **model_kwargs) + length_penalty, num_beams, vocab_size) else: output = self._generate_no_beam_search(input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, effective_batch_size, - **model_kwargs) + pad_token_id, eos_token_ids, effective_batch_size) if num_return_sequences != 1: output = output.view(batch_size, num_return_sequences, -1) @@ -590,19 +589,18 @@ class PreTrainedModel(nn.Module): def _generate_no_beam_search(self, input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, - pad_token_id, eos_token_ids, batch_size, - **model_kwargs): + pad_token_id, eos_token_ids, batch_size): """ Generate sequences for each example without beam search (num_beams == 1). All returned sequence are generated independantly. 
""" # current position / max lengths / length of generated sentences / unfinished sentences unfinished_sents = input_ids.new(batch_size).fill_(1) - # cache compute states + # TODO: add cached compute states pasts = None while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts, **model_kwargs) + model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) outputs = self(**model_inputs) next_token_logits = outputs[0][:, -1, :] @@ -614,7 +612,7 @@ class PreTrainedModel(nn.Module): if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: + if temperature > 0 and temperature != 1.0: next_token_logits = next_token_logits / temperature # Top-p/top-k filtering next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p) @@ -644,8 +642,7 @@ class PreTrainedModel(nn.Module): def _generate_beam_search(self, input_ids, cur_len, max_length, do_sample, temperature, top_k, top_p, repetition_penalty, pad_token_id, eos_token_ids, batch_size, - length_penalty, num_beams, vocab_size, - **model_kwargs): + length_penalty, num_beams, vocab_size): """ Generate sequences for each example with beam search. """ # Expand input to num beams @@ -667,7 +664,7 @@ class PreTrainedModel(nn.Module): done = [False for _ in range(batch_size)] while cur_len < max_length: - model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts, **model_kwargs) + model_inputs = self.prepare_inputs_for_generation(input_ids, pasts=pasts) scores = self(**model_inputs)[0] # (batch_size * num_beams, cur_len, vocab_size) scores = scores[:, -1, :] # (batch_size * num_beams, vocab_size) @@ -679,7 +676,7 @@ class PreTrainedModel(nn.Module): if do_sample: # Temperature (higher temperature => more likely to sample low probability tokens) - if temperature != 1.0: + if temperature > 0 and temperature != 1.0: scores = scores / temperature # Top-p/top-k filtering scores = top_k_top_p_filtering(scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2) # (batch_size * num_beams, vocab_size) diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py index 6691b0f60b..35bada92af 100644 --- a/transformers/modeling_xlm.py +++ b/transformers/modeling_xlm.py @@ -639,9 +639,9 @@ class XLMWithLMHeadModel(XLMPreTrainedModel): def get_output_embeddings(self): return self.pred_layer.proj - def prepare_inputs_for_generation(self, input_ids, **model_kwargs): - mask_token_id = model_kwargs['mask_token_id'] if 'mask_token_id' in model_kwargs else self.config.mask_token_id - lang_id = model_kwargs['lang_id'] if 'lang_id' in model_kwargs else self.config.lang_id + def prepare_inputs_for_generation(self, input_ids, **kwargs): + mask_token_id = self.config.mask_token_id + lang_id = self.config.lang_id mask_token = torch.full((1, 1), mask_token_id, dtype=torch.long, device=input_ids.device) input_ids = torch.cat([input_ids, mask_token], dim=1) From f86ed2318917edc9aa8e21b97f292fd623ad5273 Mon Sep 17 00:00:00 2001 From: thomwolf Date: Sat, 21 Dec 2019 14:13:06 +0100 Subject: [PATCH 302/302] update doc --- transformers/modeling_utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py index 25124f1fda..05e5ed3573 100644 --- a/transformers/modeling_utils.py +++ b/transformers/modeling_utils.py @@ -529,6 +529,16 @@ class PreTrainedModel(nn.Module): The cumulative probability of parameter highest probability vocabulary tokens to 
keep for nucleus sampling. Must be between 0 and 1. Default to 1. **repetition_penalty**: (`optional`) float The parameter for repetition penalty. Between 1.0 and + infinity. 1.0 means no penalty. Default to 1. + **bos_token_id**: (`optional`) int + Beginning of sentence token if no prompt is provided. Default to 0. + **eos_token_ids**: (`optional`) int or list of int + End of sequence token or list of tokens to stop the generation. Default to 0. + **length_penalty**: (`optional`) int + Exponential penalty to the length. Default to 0. + **length_penalty**: (`optional`) float + Exponential penalty to the length. Default to 1. + **num_return_sequences**: (`optional`) int + The number of independantly computed returned sequences for each element in the batch. Default to 1. """ # We cannot generate if the model does not have a LM head
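The reworked generation API can be exercised end to end with a short script; the following is an illustrative sketch only, assuming a standard "gpt2" checkpoint, and it mirrors the call pattern used in examples/run_generation.py after these patches.

from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Encode the prompt as in run_generation.py: no special tokens, returned
# directly as a PyTorch tensor of shape (1, prompt_length).
encoded_prompt = tokenizer.encode("The Manhattan Bridge is", add_special_tokens=False, return_tensors="pt")

# generate() now runs under torch.no_grad() internally, takes max_length
# instead of length, and infers batch_size from input_ids.
output_sequences = model.generate(
    input_ids=encoded_prompt,
    max_length=40,
    temperature=1.0,
    top_k=0,
    top_p=0.9,
    repetition_penalty=1.0,
)

for sequence in output_sequences.tolist():
    print(tokenizer.decode(sequence, clean_up_tokenization_spaces=True))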