commit 6b763d04a9
@@ -7,20 +7,20 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)

-from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
-                            BertForMaskedLM, BertForNextSentencePrediction,
-                            BertForSequenceClassification, BertForMultipleChoice,
-                            BertForTokenClassification, BertForQuestionAnswering,
-                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                            BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
+                            BertForMaskedLM, BertForNextSentencePrediction,
+                            BertForSequenceClassification, BertForMultipleChoice,
+                            BertForTokenClassification, BertForQuestionAnswering,
+                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                            BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                               load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                               OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_gpt2 import (GPT2Config, GPT2Model,
+from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                             GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -29,7 +29,7 @@ from .modeling_xlnet import (XLNetConfig,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                              XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlm import (XLMConfig, XLMModel,
+from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
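
Both hunks above appear to come from the library's top-level __init__.py: each import tuple gains the module's abstract base class (BertPreTrainedModel, OpenAIGPTPreTrainedModel, TransfoXLPreTrainedModel, GPT2PreTrainedModel, and XLMPreTrainedModel), making the base classes importable from the package root rather than only from the individual modeling_* modules. A minimal usage sketch of what that enables; the package name pytorch_transformers and the BertWithClassifier class are assumptions for illustration, not part of the commit:

    # Illustrative only: subclass a newly exported base class to build a
    # custom model; the base class supplies config handling and
    # from_pretrained()-style weight loading for the subclass.
    import torch.nn as nn
    from pytorch_transformers import BertPreTrainedModel, BertModel

    class BertWithClassifier(BertPreTrainedModel):
        def __init__(self, config):
            super(BertWithClassifier, self).__init__(config)
            self.bert = BertModel(config)                       # pretrained backbone
            self.classifier = nn.Linear(config.hidden_size, 2)  # custom 2-way head

    # Matching backbone weights are loaded; the new head stays randomly initialized.
    model = BertWithClassifier.from_pretrained('bert-base-uncased')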
@@ -160,26 +160,46 @@ class PreTrainedTokenizer(object):
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
         if pretrained_model_name_or_path in s3_models:
             # Get the vocabulary from AWS S3 bucket
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
             # Get the vocabulary from local files
             logger.info(
                 "Model name '{}' not found in model shortcut name list ({}). "
                 "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                     pretrained_model_name_or_path, ', '.join(s3_models),
                     pretrained_model_name_or_path))
-            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
-                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
-            all_vocab_files_names.update(cls.vocab_files_names)
-            for file_id, file_name in all_vocab_files_names.items():
+
+            # Look for the tokenizer main vocabulary files
+            for file_id, file_name in cls.vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
                     # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
+                    # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
                     logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
+
+            # Look for the additional tokens files
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+
+            # If a path to a file was provided, get the parent directory
+            saved_directory = pretrained_model_name_or_path
+            if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
+                saved_directory = os.path.dirname(saved_directory)
+
+            for file_id, file_name in all_vocab_files_names.items():
+                full_file_name = os.path.join(saved_directory, file_name)
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+
         if all(full_file_name is None for full_file_name in vocab_files.values()):
             logger.error(
                 "Model name '{}' was not found in model name list ({}). "