up (#13988)
This commit is contained in:
parent
5b6bd4e788
commit
cc36064960
|
@ -237,7 +237,7 @@ class ByT5Tokenizer(PreTrainedTokenizer):
|
||||||
else:
|
else:
|
||||||
tok_string = bytes([ord(token)])
|
tok_string = bytes([ord(token)])
|
||||||
bstring += tok_string
|
bstring += tok_string
|
||||||
string = bstring.decode("utf-8")
|
string = bstring.decode("utf-8", errors="ignore")
|
||||||
return string
|
return string
|
||||||
|
|
||||||
# ByT5Tokenizer has no vocab file
|
# ByT5Tokenizer has no vocab file
|
||||||
|
|
|
@ -290,6 +290,22 @@ class ByT5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_decode_single_bytes(self):
|
||||||
|
tokenizer_list = []
|
||||||
|
if self.test_slow_tokenizer:
|
||||||
|
tokenizer_list.append((self.tokenizer_class, self.get_tokenizer()))
|
||||||
|
|
||||||
|
if self.test_rust_tokenizer:
|
||||||
|
tokenizer_list.append((self.rust_tokenizer_class, self.get_rust_tokenizer()))
|
||||||
|
|
||||||
|
for tokenizer_class, tokenizer_utils in tokenizer_list:
|
||||||
|
with tempfile.TemporaryDirectory() as tmp_dir:
|
||||||
|
tokenizer_utils.save_pretrained(tmp_dir)
|
||||||
|
|
||||||
|
tokenizer = tokenizer_class.from_pretrained(tmp_dir)
|
||||||
|
|
||||||
|
self.assertTrue(tokenizer.decode([255]) == "")
|
||||||
|
|
||||||
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
|
# tokenizer can be instantiated without any pretrained files, so no need for pretrained tokenizer list
|
||||||
def test_pretrained_model_lists(self):
|
def test_pretrained_model_lists(self):
|
||||||
pass
|
pass
|
||||||
|
|
Loading…
Reference in New Issue