Compare commits

...

12 Commits

Author SHA1 Message Date
Ita Zaporozhets 74e78f1720 fix 2024-05-31 17:15:56 +02:00
Ita Zaporozhets d92822e045 edit 2024-05-31 17:05:48 +02:00
Ita Zaporozhets c416522a76 draft 2024-05-31 16:57:32 +02:00
Ita Zaporozhets 84143a2cc3 readd cases 2024-05-31 14:13:37 +02:00
Ita Zaporozhets ff5974bb61 utils update 2024-05-31 14:01:04 +02:00
Ita Zaporozhets fdb63e21db t5 2024-05-31 13:50:52 +02:00
Ita Zaporozhets 79ce5bb67f add t5 2024-05-31 13:15:59 +02:00
Ita Zaporozhets 896b7d152e draft pr 2024-05-31 13:15:59 +02:00
Ita Zaporozhets 7afb15921d more general approach 2024-05-31 13:15:59 +02:00
Ita Zaporozhets 31fbe4f12c add comment 2024-05-31 13:15:59 +02:00
Ita Zaporozhets d1ea757c21 add user defined symbols to all tokenizers from SpmConverter 2024-05-31 13:15:59 +02:00
Ita Zaporozhets 24ea0cd756 adding user defined tokens #30824 2024-05-31 13:15:59 +02:00
7 changed files with 133 additions and 14 deletions

View File

@@ -620,14 +620,26 @@ class SpmConverter(Converter):
     def converted(self) -> Tokenizer:
         tokenizer = self.tokenizer(self.proto)

+        # Add user defined symbols
+        user_defined_symbols = [
+            AddedToken(token, normalized=True, special=False) for token in self.proto.trainer_spec.user_defined_symbols
+        ]
+        control_symbols = [
+            AddedToken(token, normalized=True, special=False) for token in self.proto.trainer_spec.control_symbols
+        ]
+        tokenizer.add_tokens(user_defined_symbols + control_symbols)
+
         # Tokenizer assemble
         normalizer = self.normalizer(self.proto)
         if normalizer is not None:
             tokenizer.normalizer = normalizer

         replacement = "▁"
-        add_prefix_space = True
-        if hasattr(self.original_tokenizer, "add_prefix_space"):
+        add_prefix_space = True  # TODO:ita added 1
+        add_prefix_space = self.proto.normalizer_spec.add_dummy_prefix
+        # tokenizer.add_prefix_space = add_prefix_space
+        if hasattr(self.original_tokenizer, "add_prefix_space") and self.original_tokenizer.add_prefix_space is not None:
             add_prefix_space = self.original_tokenizer.add_prefix_space

         pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
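
The block above registers every `user_defined_symbols` and `control_symbols` entry from the SentencePiece proto as an added token on the converted backend. A minimal standalone sketch of the same mechanism (the file paths are placeholders, and the direct protobuf import is an assumption about the installed `sentencepiece` package, not part of this diff):

```python
# Sketch only: read user-defined symbols out of a SentencePiece model proto and
# register them on an already-converted fast tokenizer.
from sentencepiece import sentencepiece_model_pb2 as model_pb2  # assumption: shipped with sentencepiece
from tokenizers import AddedToken, Tokenizer

proto = model_pb2.ModelProto()
with open("tokenizer.model", "rb") as f:  # placeholder path
    proto.ParseFromString(f.read())

extra_tokens = [
    AddedToken(tok, normalized=True, special=False)
    for tok in list(proto.trainer_spec.user_defined_symbols) + list(proto.trainer_spec.control_symbols)
]

backend = Tokenizer.from_file("tokenizer.json")  # placeholder: the converter's output
backend.add_tokens(extra_tokens)
```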

View File

@@ -167,8 +167,8 @@ class LlamaTokenizer(PreTrainedTokenizer):
         self.add_bos_token = add_bos_token
         self.add_eos_token = add_eos_token
         self.use_default_system_prompt = use_default_system_prompt
-        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
         self.add_prefix_space = add_prefix_space
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))

         super().__init__(
             bos_token=bos_token,
@@ -203,6 +203,7 @@ class LlamaTokenizer(PreTrainedTokenizer):
             model = model_pb2.ModelProto.FromString(sp_model)
             normalizer_spec = model_pb2.NormalizerSpec()
             normalizer_spec.add_dummy_prefix = False
+            self.add_prefix_space = normalizer_spec.add_dummy_prefix if self.add_prefix_space is None else self.add_prefix_space
             model.normalizer_spec.MergeFrom(normalizer_spec)
             sp_model = model.SerializeToString()
             tokenizer.LoadFromSerializedProto(sp_model)
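
Here the slow tokenizer keeps disabling SentencePiece's internal dummy prefix and, when the caller did not pass `add_prefix_space`, falls back to the proto's setting. A rough sketch of that mechanism outside the class (paths are placeholders; this is not the PR's exact code, which reads the flag off a locally built `NormalizerSpec`):

```python
# Sketch only: defer to the spm proto when add_prefix_space is unset, then turn the
# dummy prefix off so prefixing is handled on the Python side.
import sentencepiece as spm
from sentencepiece import sentencepiece_model_pb2 as model_pb2  # assumption

add_prefix_space = None  # what the caller passed; None means "follow the model"

with open("tokenizer.model", "rb") as f:  # placeholder path
    model = model_pb2.ModelProto.FromString(f.read())

if add_prefix_space is None:
    add_prefix_space = model.normalizer_spec.add_dummy_prefix

model.normalizer_spec.add_dummy_prefix = False
sp = spm.SentencePieceProcessor()
sp.LoadFromSerializedProto(model.SerializeToString())
```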

View File

@@ -13,10 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import json
 from shutil import copyfile
 from typing import Optional, Tuple

-from tokenizers import processors
+from tokenizers import pre_tokenizers, normalizers, processors

 from ...tokenization_utils_fast import PreTrainedTokenizerFast
 from ...utils import is_sentencepiece_available, logging
@@ -150,7 +152,12 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
             legacy = True
         self.legacy = legacy

-        if add_prefix_space is not None:
+        if add_prefix_space is not None: #TODO: ita
+            self.add_prefix_space = add_prefix_space
+        # if add_prefix_space is not None:
+        #     kwargs["from_slow"] = True
+        if self.force_from_slow() is True:
             kwargs["from_slow"] = True

         super().__init__(
@@ -203,6 +210,48 @@ class LlamaTokenizerFast(PreTrainedTokenizerFast):
             single=single, pair=pair, special_tokens=special_tokens
         )

+    def force_from_slow(self):
+        if getattr(self, "add_prefix_space") == None:
+            if getattr(self, "_tokenizer", None) is None:
+                return True
+            curr_normalizer = json.loads(self._tokenizer.normalizer.__getstate__().decode('utf-8'))
+            prepend_normalizer = [n for n in curr_normalizer['normalizers'] if n['type'] == 'Prepend']
+            if not prepend_normalizer:
+                return True
+
+    def update_normalizer(self):
+        """Updates the underlying normalizer with the current `add_prefix_space` and `legacy` settings."""
+        sequence = []
+        if getattr(self, "legacy", True):
+            if getattr(self, "add_prefix_space", True):
+                sequence += [normalizers.Prepend(prepend="▁")]
+            sequence += [normalizers.Replace(pattern=" ", content="▁")]
+        elif not getattr(self, "legacy", True):
+            self._tokenizer.normalizer = normalizers.Sequence(sequence)
+
+    def update_pre_tokenizer(self):
+        """Updates the underlying pre-tokenizer with the current `add_prefix_space` setting."""
+        sequence = []
+        if getattr(self, "add_prefix_space") == False:
+            prepend_scheme = "never"
+        elif getattr(self, "add_prefix_space") == None:
+            curr_normalizer = json.loads(self._tokenizer.normalizer.__getstate__().decode('utf-8'))
+            prepend_normalizer = [n for n in curr_normalizer['normalizers'] if n['type'] == 'Prepend']
+            if prepend_normalizer:
+                prepend_normalizer = prepend_normalizer[0]
+                replacement = prepend_normalizer['prepend']
+                self.add_prefix_space = True
+            else:
+                return
+        if getattr(self, "add_prefix_space", True):
+            prepend_scheme = "always"
+        if not getattr(self, "legacy", True):
+            prepend_scheme = "first"
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme,
+                                                                 split=False)
+        self.update_normalizer()
+
     @property
     def add_eos_token(self):
         return self._add_eos_token
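
The new `update_pre_tokenizer` boils down to choosing a `prepend_scheme` for the Metaspace pre-tokenizer. A small sketch of that mapping with a hypothetical helper (recent `tokenizers` releases expose `prepend_scheme` on `Metaspace`):

```python
# Sketch only: map add_prefix_space / legacy onto Metaspace's prepend_scheme.
from tokenizers import pre_tokenizers

def pick_prepend_scheme(add_prefix_space: bool, legacy: bool) -> str:
    # hypothetical helper mirroring the branches in update_pre_tokenizer above
    if not add_prefix_space:
        return "never"
    return "always" if legacy else "first"

pre_tok = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=pick_prepend_scheme(True, True))
print(pre_tok.pre_tokenize_str("Hello world"))
# prints something like [('▁Hello', (0, 5)), ('▁world', (5, 11))]
```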

View File

@@ -134,7 +134,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         additional_special_tokens=None,
         sp_model_kwargs: Optional[Dict[str, Any]] = None,
         legacy=None,
-        add_prefix_space=True,
+        add_prefix_space=None,
         **kwargs,
     ) -> None:
         pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
@@ -181,10 +181,10 @@ class T5Tokenizer(PreTrainedTokenizer):
             legacy = True
         self.legacy = legacy
+        self.add_prefix_space = add_prefix_space
         self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
         self.vocab_file = vocab_file
         self._extra_ids = extra_ids
-        self.add_prefix_space = add_prefix_space

         super().__init__(
             eos_token=eos_token,
@@ -211,6 +211,7 @@ class T5Tokenizer(PreTrainedTokenizer):
             model = model_pb2.ModelProto.FromString(sp_model)
             normalizer_spec = model_pb2.NormalizerSpec()
             normalizer_spec.add_dummy_prefix = False
+            self.add_prefix_space = normalizer_spec.add_dummy_prefix if self.add_prefix_space is None else self.add_prefix_space
             model.normalizer_spec.MergeFrom(normalizer_spec)
             sp_model = model.SerializeToString()
             tokenizer.LoadFromSerializedProto(sp_model)

View File

@@ -114,7 +114,9 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
             logger.warning_once(
                 "You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers"
             )
-            kwargs["from_slow"] = True
+            # kwargs["from_slow"] = True
+            self.add_prefix_space = add_prefix_space

         super().__init__(
             vocab_file,
@@ -123,6 +125,7 @@ class T5TokenizerFast(PreTrainedTokenizerFast):
             unk_token=unk_token,
             pad_token=pad_token,
             extra_ids=extra_ids,
+            add_prefix_space=add_prefix_space,
             additional_special_tokens=additional_special_tokens,
             **kwargs,
         )
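
The intended user-facing effect is that `add_prefix_space` now lands on the fast tokenizer itself instead of forcing a slow-to-fast re-conversion. A hedged usage sketch (the checkpoint name is an example; the exact tokens depend on the vocabulary):

```python
# Sketch only: add_prefix_space passed at load time should be honoured by the fast backend.
from transformers import T5TokenizerFast

tok = T5TokenizerFast.from_pretrained("t5-small", add_prefix_space=False)
print(tok.add_prefix_space)         # False
print(tok.tokenize("Hello world"))  # with the PR applied, the first token should not start with '▁'
```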

View File

@@ -607,6 +607,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase):
                     f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}"
                 )
         # ["This is something", "<special_token_1>", "else"]
+        import json
         tokenized_text = []
         for token in tokens:
             # Need to skip eventual empty (fully stripped) tokens

View File

@@ -23,6 +23,7 @@ import os
 from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Union

+from tokenizers import pre_tokenizers, normalizers, processors
 import tokenizers.pre_tokenizers as pre_tokenizers_fast
 from tokenizers import Encoding as EncodingFast
 from tokenizers import Tokenizer as TokenizerFast
@@ -102,15 +103,16 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         from_slow = kwargs.pop("from_slow", False)
         added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})

-        if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
-            raise ValueError(
-                "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
-                "have sentencepiece installed."
-            )
+        # TODO:Ita
+        # if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
+        #     raise ValueError(
+        #         "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
+        #         "have sentencepiece installed."
+        #     )

         if tokenizer_object is not None:
             fast_tokenizer = copy.deepcopy(tokenizer_object)
-        elif fast_tokenizer_file is not None and not from_slow:
+        elif fast_tokenizer_file is not None:  # and not from_slow:
             # We have a serialization from tokenizers which let us directly build the backend
             fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
         elif slow_tokenizer is not None:
@@ -124,6 +126,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             # We need to create and convert a slow tokenizer to build the backend
             slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs)
             fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
+        elif slow_tokenizer is not None:
+            # We need to convert a slow tokenizer to build the backend
+            tokenizer_dict = load_gguf_checkpoint(kwargs.get("vocab_file"))["tokenizer"]
+            fast_tokenizer = convert_gguf_tokenizer(tokenizer_dict)
+
         else:
             raise ValueError(
                 "Couldn't instantiate the backend tokenizer from one of: \n"
@@ -135,6 +142,8 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
         self._tokenizer = fast_tokenizer

+        self.update_pre_tokenizer()
+
         if slow_tokenizer is not None:
             kwargs.update(slow_tokenizer.init_kwargs)
@@ -861,3 +870,46 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
             kwargs["additional_special_tokens"] = additional_special_tokens

         return self.__class__(tokenizer_object=tokenizer, **kwargs)
+
+    def update_normalizer(self):
+        """Updates the underlying normalizer with the current `add_prefix_space` and `legacy` settings."""
+        sequence = []
+        if getattr(self, "legacy", True):
+            if getattr(self, "add_prefix_space", True):
+                sequence += [normalizers.Prepend(prepend="▁")]
+            sequence += [normalizers.Replace(pattern=" ", content="▁")]
+            self._tokenizer.normalizer = normalizers.Sequence(sequence)
+        elif not getattr(self, "legacy", True):
+            return
+        self._tokenizer.normalizer = normalizers.Sequence(sequence)  #TODO:ita2
+
+    def update_pre_tokenizer(self):
+        """Updates the underlying pre-tokenizer with the current `add_prefix_space` setting."""
+        sequence = []
+        if getattr(self, "add_prefix_space", None) == None:
+            if getattr(self._tokenizer, "normalizer", None) == None:
+                return
+            curr_normalizer = json.loads(self._tokenizer.normalizer.__getstate__().decode('utf-8'))
+            if 'normalizers' not in curr_normalizer:
+                return
+            prepend_normalizer = [n for n in curr_normalizer['normalizers'] if n['type'] == 'Prepend']
+            if prepend_normalizer:
+                prepend_normalizer = prepend_normalizer[0]
+                replacement = prepend_normalizer['prepend']
+                self.add_prefix_space = True
+            else:
+                return
+        elif getattr(self, "add_prefix_space") == False:
+            prepend_scheme = "never"
+        if getattr(self, "add_prefix_space", True):
+            prepend_scheme = "always"
+        if not getattr(self, "legacy", True):
+            prepend_scheme = "first"
+        self._tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme=prepend_scheme,
+                                                                 split=False)
+        self.update_normalizer()
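
`update_pre_tokenizer` infers a missing `add_prefix_space` by looking for a `Prepend` step in the backend's serialized normalizer. A sketch of that inspection on its own (the checkpoint name is an assumption, and `__getstate__` returning JSON bytes is taken from the PR code above):

```python
# Sketch only: look for a Prepend normalizer in a fast tokenizer's serialized state.
import json
from tokenizers import Tokenizer

backend = Tokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")  # assumed public repo
state = json.loads(backend.normalizer.__getstate__().decode("utf-8")) if backend.normalizer else {}
prepend = [n for n in state.get("normalizers", []) if n["type"] == "Prepend"]
print(prepend)  # e.g. [{'type': 'Prepend', 'prepend': '▁'}] when a prefix space is baked in
```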