[tokenizer] sanitize saved config (#21483)

* [tokenizer] sanitize saved config

* rm config["name_or_path"] test
This commit is contained in:
Stas Bekman 2023-02-07 10:51:45 -08:00 committed by GitHub
parent 67d074874d
commit b9af152efb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 4 additions and 2 deletions

View File

@@ -2153,6 +2153,10 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
if self._auto_class is not None:
custom_object_save(self, save_directory, config=tokenizer_config)
# remove private information
if "name_or_path" in tokenizer_config:
tokenizer_config.pop("name_or_path")
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)

View File

@@ -230,8 +230,6 @@ class AutoTokenizerTest(unittest.TestCase):
# Check the class of the tokenizer was properly saved (note that it always saves the slow class).
self.assertEqual(config["tokenizer_class"], "BertTokenizer")
# Check other keys just to make sure the config was properly saved /reloaded.
self.assertEqual(config["name_or_path"], SMALL_MODEL_IDENTIFIER)
def test_new_tokenizer_registration(self):
try: