From 632d711411d2126e90cd4657f411a09bc180f561 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Fri, 26 Jul 2019 21:14:37 +0200
Subject: [PATCH 1/2] fix #908

---
 pytorch_transformers/__init__.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index f875e4ab18..b4b957192c 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -7,20 +7,20 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
-from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
-                            BertForMaskedLM, BertForNextSentencePrediction,
-                            BertForSequenceClassification, BertForMultipleChoice,
-                            BertForTokenClassification, BertForQuestionAnswering,
-                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                            BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
+                            BertForMaskedLM, BertForNextSentencePrediction,
+                            BertForSequenceClassification, BertForMultipleChoice,
+                            BertForTokenClassification, BertForQuestionAnswering,
+                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                            BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                               load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                               OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_gpt2 import (GPT2Config, GPT2Model,
+from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                             GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -29,7 +29,7 @@ from .modeling_xlnet import (XLNetConfig,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                              XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlm import (XLMConfig, XLMModel,
+from .modeling_xlm import (XLMConfig, XLMPreTrainedModel, XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)

From 7b6e474c9acc26962363e78ef95fdb6f006eb0b4 Mon Sep 17 00:00:00 2001
From: thomwolf
Date: Fri, 26 Jul 2019 21:26:44 +0200
Subject: [PATCH 2/2] fix #901

---
 pytorch_transformers/tokenization_utils.py | 28 ++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 858edc7c50..e2fe46320e 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -160,26 +160,46 @@ class PreTrainedTokenizer(object):
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
         if pretrained_model_name_or_path in s3_models:
+            # Get the vocabulary from AWS S3 bucket
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
+            # Get the vocabulary from local files
             logger.info(
                 "Model name '{}' not found in model shortcut name list ({}). "
                 "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                     pretrained_model_name_or_path, ', '.join(s3_models),
                     pretrained_model_name_or_path))
-            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
-                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
-            all_vocab_files_names.update(cls.vocab_files_names)
-            for file_id, file_name in all_vocab_files_names.items():
+
+            # Look for the tokenizer main vocabulary files
+            for file_id, file_name in cls.vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
+                    # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
+                    # If a path to a file is provided we use it (will only work for non-BPE tokenizers using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
                     logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
+
+            # Look for the additional tokens files
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+
+            # If a path to a file was provided, get the parent directory
+            saved_directory = pretrained_model_name_or_path
+            if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
+                saved_directory = os.path.dirname(saved_directory)
+
+            for file_id, file_name in all_vocab_files_names.items():
+                full_file_name = os.path.join(saved_directory, file_name)
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+
         if all(full_file_name is None for full_file_name in vocab_files.values()):
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "
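
Taken together, the two patches are complementary: PATCH 1/2 re-exports the model base classes (BertPreTrainedModel, OpenAIGPTPreTrainedModel, TransfoXLPreTrainedModel, GPT2PreTrainedModel, XLMPreTrainedModel) from the package root (fix #908), and PATCH 2/2 makes PreTrainedTokenizer.from_pretrained() also look for added_tokens.json and special_tokens_map.json in the parent directory when it is given a path to a single vocabulary file rather than a directory (fix #901). A minimal sketch exercising both fixes, assuming the patches are applied; the local path ./my_model/vocab.txt is illustrative and not part of the patches:

    # Sketch only: assumes ./my_model/vocab.txt was written earlier by
    # tokenizer.save_pretrained('./my_model'); the path is hypothetical.
    from pytorch_transformers import BertPreTrainedModel, BertTokenizer

    # Fix #908: the *PreTrainedModel base classes are now importable from the
    # package root, e.g. for subclassing to build a custom task head.
    assert BertPreTrainedModel.__name__ == 'BertPreTrainedModel'

    # Fix #901: pointing from_pretrained() at a single vocabulary file now
    # also picks up added_tokens.json and special_tokens_map.json from that
    # file's parent directory, if they exist.
    tokenizer = BertTokenizer.from_pretrained('./my_model/vocab.txt')
    print(tokenizer.tokenize("Hello world"))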