Revert error back into warning for byte fallback conversion. (#22607)

This commit is contained in:
Nicolas Patry 2023-04-06 14:00:29 +02:00 committed by GitHub
parent 1670be4bde
commit 0aa1153ffb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 6 additions and 3 deletions

View File

@@ -19,6 +19,7 @@ All the conversions are grouped here to gather SentencePiece dependencies outsid
allow to make our dependency on SentencePiece optional.
"""
+import warnings
from typing import Dict, List, Tuple
from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, processors
@@ -450,7 +451,7 @@ class SpmConverter(Converter):
if self.proto.trainer_spec.byte_fallback:
if not getattr(self, "handle_byte_fallback", None):
-raise RuntimeError(
+warnings.warn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "

View File

@@ -24,10 +24,12 @@ class ConvertSlowTokenizerTest(unittest.TestCase):
original_tokenizer_with_bytefallback = FakeOriginalTokenizer(vocab_file=spm_model_file_with_bytefallback)
-with self.assertRaises(RuntimeError) as cm:
+with warnings.catch_warnings(record=True) as w:
_ = SpmConverter(original_tokenizer_with_bytefallback)
+self.assertEqual(len(w), 1)
self.assertIn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers.",
-str(cm.exception),
+str(w[0].message),
)