From 8b0aa677963538dad24667783df9344650a59c18 Mon Sep 17 00:00:00 2001 From: Ita Zaporozhets Date: Tue, 21 May 2024 10:05:23 +0200 Subject: [PATCH] Add user-defined tokens (#30824) --- src/transformers/convert_slow_tokenizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/transformers/convert_slow_tokenizer.py b/src/transformers/convert_slow_tokenizer.py index b1d36dbc55..b2a5b500ff 100644 --- a/src/transformers/convert_slow_tokenizer.py +++ b/src/transformers/convert_slow_tokenizer.py @@ -1397,6 +1397,10 @@ class LlamaConverter(SpmConverter): AddedToken(self.original_tokenizer.convert_ids_to_tokens(2), normalized=False, special=True), ] ) + user_defined_symbols = [ + AddedToken(token, normalized=True, special=False) for token in proto.trainer_spec.user_defined_symbols + ] + tokenizer.add_tokens(user_defined_symbols) else: raise Exception( "You're trying to run a `Unigram` model but you're file was trained with a different algorithm"