Add test for Chinese tokenization

Author: Yaser Martinez Palenzuela (committed by GitHub)
Date: 2018-11-05 23:04:29 +01:00
Commit: 4d124baf8f (parent 0ce2f496dc)
1 changed file with 7 additions and 0 deletions


@@ -43,6 +43,13 @@ class TokenizationTest(unittest.TestCase):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
+    def test_chinese(self):
+        tokenizer = tokenization.BasicTokenizer()
+
+        self.assertListEqual(
+            tokenizer.tokenize(u"ah\u535A\u63A8zz"),
+            [u"ah", u"\u535A", u"\u63A8", u"zz"])
+
     def test_basic_tokenizer_lower(self):
         tokenizer = tokenization.BasicTokenizer(do_lower_case=True)
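
For context on what the new assertion checks: BasicTokenizer surrounds every CJK character with whitespace before splitting, so each Chinese character comes out as its own token while the ASCII runs "ah" and "zz" stay whole. Below is a minimal standalone sketch of the same check, not part of the commit; it assumes the repository's tokenization module is importable from the working directory.

# Standalone sketch of the behavior the new test asserts.
# Assumption: the repository's tokenization module is on the Python path.
import tokenization

tokenizer = tokenization.BasicTokenizer()

# Each CJK character is padded with spaces before whitespace splitting,
# so it becomes its own token; Latin runs are left intact.
tokens = tokenizer.tokenize(u"ah\u535A\u63A8zz")
print(tokens)  # expected: ["ah", "博", "推", "zz"]  (U+535A = 博, U+63A8 = 推)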