This commit is contained in:
Patrick von Platen 2021-10-14 10:54:20 +02:00 committed by GitHub
parent 5b6bd4e788
commit cc36064960
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 17 additions and 1 deletions

View File

@ -237,7 +237,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
else:
tok_string = bytes([ord(token)])
bstring += tok_string
string = bstring.decode("utf-8")
string = bstring.decode("utf-8", errors="ignore")
return string
# ByT5Tokenizer has no vocab file

View File

@ -290,6 +290,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
),
)
def test_decode_single_bytes(self):
    """Check that decoding the lone token id 255 yields an empty string.

    Every enabled tokenizer flavour (slow and/or rust, as selected by
    ``self.test_slow_tokenizer`` / ``self.test_rust_tokenizer``) is saved
    to a temporary directory and reloaded with ``from_pretrained`` before
    the check, so the reloaded tokenizer is what gets exercised.

    NOTE(review): id 255 presumably maps to a single byte that is not
    valid UTF-8 on its own, so decoding must drop it rather than raise --
    confirm against the tokenizer's id-to-byte offset.
    """
    tokenizer_list = []
    if self.test_slow_tokenizer:
        tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
    if self.test_rust_tokenizer:
        tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))

    for tokenizer_class, tokenizer_utils in tokenizer_list:
        with tempfile.TemporaryDirectory() as tmp_dir:
            # Round-trip through disk so the reloaded tokenizer is tested.
            tokenizer_utils.save_pretrained(tmp_dir)
            tokenizer = tokenizer_class.from_pretrained(tmp_dir)

            # assertEqual (instead of assertTrue(a == b)) reports the
            # actual decoded value when the check fails.
            self.assertEqual(tokenizer.decode([255]), "")
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
def test_pretrained_model_lists(self):
pass