adding user defined tokens #30824

This commit is contained in:
Ita Zaporozhets 2024-05-21 10:05:23 +02:00
parent c876d12127
commit 996ff224a3
1 changed file with 4 additions and 0 deletions

View File

@@ -1385,6 +1385,10 @@ class LlamaConverter(SpmConverter):
AddedToken(self.original_tokenizer.convert_ids_to_tokens(2), normalized=False, special=True),
]
)
user_defined_symbols = [
AddedToken(token, normalized=True, special=False) for token in proto.trainer_spec.user_defined_symbols
]
tokenizer.add_tokens(user_defined_symbols)
else:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"