Add token type ids to CodeGenTokenizer (#29265)

* Add create token type ids to CodeGenTokenizer

* Fix inconsistent length of token type ids

* Format source codes

* Fix inconsistent order of methods

* Update docstring

* Add test_tokenizer_integration test

* Format source codes

* Add `copied from` comment to CodeGenTokenizerFast

* Add doc of create_token_type_ids_from_sequences

* Make return_token_type_ids False by default

* Mark test_tokenizer_integration as a slow test

* Add return_token_type_ids to tokenizer init arg

* Add test for tokenizer's init return_token_type_ids

* Format source codes
Authored by st81 on 2024-04-17 19:19:18 +09:00; committed by GitHub
parent 812a5de229
commit 8d6b509611
4 changed files with 127 additions and 0 deletions
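For context, a minimal usage sketch of the feature this commit adds (the checkpoint name is borrowed from the integration test added below; the snippet itself is illustrative and not part of the diff):

```python
from transformers import CodeGenTokenizer

# Passing the new init argument appends "token_type_ids" to model_input_names,
# so encodings carry token type IDs alongside input_ids and attention_mask.
tokenizer = CodeGenTokenizer.from_pretrained(
    "Salesforce/codegen-350M-mono", return_token_type_ids=True
)

encoding = tokenizer("def hello_world():")
print(encoding["token_type_ids"])  # all zeros for a single sequence
```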


@@ -72,6 +72,7 @@ hello_world()
## CodeGenTokenizer
[[autodoc]] CodeGenTokenizer
- create_token_type_ids_from_sequences
- save_vocabulary
## CodeGenTokenizerFast


@@ -134,6 +134,8 @@ class CodeGenTokenizer(PreTrainedTokenizer):
other word. (CodeGen tokenizer detects beginnings of words by the preceding space).
add_bos_token (`bool`, *optional*, defaults to `False`):
Whether to add a beginning of sequence token at the start of sequences.
return_token_type_ids (`bool`, *optional*, defaults to `False`):
Whether to return token type IDs.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -150,6 +152,7 @@ class CodeGenTokenizer(PreTrainedTokenizer):
pad_token=None,
add_prefix_space=False,
add_bos_token=False,
return_token_type_ids=False,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
@@ -157,6 +160,9 @@ class CodeGenTokenizer(PreTrainedTokenizer):
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
self.add_bos_token = add_bos_token
self.return_token_type_ids = return_token_type_ids
if self.return_token_type_ids:
self.model_input_names.append("token_type_ids")
with open(vocab_file, encoding="utf-8") as vocab_handle:
self.encoder = json.load(vocab_handle)
@@ -181,6 +187,7 @@ class CodeGenTokenizer(PreTrainedTokenizer):
pad_token=pad_token,
add_prefix_space=add_prefix_space,
add_bos_token=add_bos_token,
return_token_type_ids=return_token_type_ids,
**kwargs,
)
@@ -270,6 +277,35 @@ class CodeGenTokenizer(PreTrainedTokenizer):
text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
return text
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id] if self.sep_token_id is not None else []
cls = [self.cls_token_id] if self.cls_token_id is not None else []
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
if not os.path.isdir(save_directory):
logger.error(f"Vocabulary path ({save_directory}) should be a directory")
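A minimal sketch of what the new `create_token_type_ids_from_sequences` returns for a sequence pair, assuming the same checkpoint as the integration test below (illustrative, not part of the diff):

```python
from transformers import CodeGenTokenizer

tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")

ids_a = tokenizer.encode("def add(a, b):")
ids_b = tokenizer.encode("return a + b")

# CodeGen configures no cls/sep tokens by default, so the special-token lists
# in the method are empty and the mask is just 0s for the first sequence
# followed by 1s for the second.
mask = tokenizer.create_token_type_ids_from_sequences(ids_a, ids_b)
assert mask == [0] * len(ids_a) + [1] * len(ids_b)
```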


@@ -91,6 +91,8 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (CodeGen tokenizer detects beginnings of words by the preceding space).
return_token_type_ids (`bool`, *optional*, defaults to `False`):
Whether to return token type IDs.
"""
vocab_files_names = VOCAB_FILES_NAMES
@@ -106,8 +108,13 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
bos_token="<|endoftext|>",
eos_token="<|endoftext|>",
add_prefix_space=False,
return_token_type_ids=False,
**kwargs,
):
self.return_token_type_ids = return_token_type_ids
if self.return_token_type_ids:
self.model_input_names.append("token_type_ids")
super().__init__(
vocab_file,
merges_file,
@@ -116,6 +123,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
bos_token=bos_token,
eos_token=eos_token,
add_prefix_space=add_prefix_space,
return_token_type_ids=return_token_type_ids,
**kwargs,
)
@@ -157,6 +165,36 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
return super()._encode_plus(*args, **kwargs)
# Copied from transformers.models.codegen.tokenization_codegen.CodeGenTokenizer.create_token_type_ids_from_sequences
def create_token_type_ids_from_sequences(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task. A sequence
pair mask has the following format:
```
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
```
If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (`List[int]`):
List of IDs.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
"""
sep = [self.sep_token_id] if self.sep_token_id is not None else []
cls = [self.cls_token_id] if self.cls_token_id is not None else []
if token_ids_1 is None:
return len(cls + token_ids_0 + sep) * [0]
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
files = self._tokenizer.model.save(save_directory, name=filename_prefix)
return tuple(files)
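The fast tokenizer mirrors the same behavior; a quick sketch under the same assumptions (not part of the diff):

```python
from transformers import CodeGenTokenizerFast

fast_tokenizer = CodeGenTokenizerFast.from_pretrained(
    "Salesforce/codegen-350M-mono", return_token_type_ids=True
)

# Setting the flag appends "token_type_ids" to model_input_names, so batched
# encodings include a token_type_ids entry as well.
batch = fast_tokenizer(["def f(x):", "return x"])
print("token_type_ids" in batch)  # True
```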


@@ -264,3 +264,55 @@ class CodeGenTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# tokenizer has no padding token
def test_padding_different_model_input_name(self):
pass
@slow
def test_tokenizer_integration(self):
# Custom test since this tokenizer takes return_token_type_ids as an init argument for backward compatibility.
sequences = [
"Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides "
"general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural "
"Language Understanding (NLU) and Natural Language Generation (NLG) with over 32+ pretrained "
"models in 100+ languages and deep interoperability between Jax, PyTorch and TensorFlow.",
"BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly "
"conditioning on both left and right context in all layers.",
"The quick brown fox jumps over the lazy dog.",
]
tokenizer_classes = [self.tokenizer_class]
if self.test_rust_tokenizer:
tokenizer_classes.append(self.rust_tokenizer_class)
# Test default case. i.e. return_token_type_ids is False.
for tokenizer_class in tokenizer_classes:
tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono")
encoding = tokenizer(sequences)
decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
# fmt: off
expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
# fmt: on
encoding_data = encoding.data
self.assertDictEqual(encoding_data, expected_encoding)
for expected, decoded in zip(sequences, decoded_sequences):
self.assertEqual(expected, decoded)
# Test return_token_type_ids is True case.
for tokenizer_class in tokenizer_classes:
tokenizer = tokenizer_class.from_pretrained("Salesforce/codegen-350M-mono", return_token_type_ids=True)
encoding = tokenizer(sequences)
decoded_sequences = [tokenizer.decode(seq, skip_special_tokens=True) for seq in encoding["input_ids"]]
# fmt: off
expected_encoding = {'input_ids': [[41762, 364, 357, 36234, 1900, 355, 12972, 13165, 354, 12, 35636, 364, 290, 12972, 13165, 354, 12, 5310, 13363, 12, 4835, 8, 3769, 2276, 12, 29983, 45619, 357, 13246, 51, 11, 402, 11571, 12, 17, 11, 5564, 13246, 38586, 11, 16276, 44, 11, 4307, 346, 33, 861, 11, 16276, 7934, 23029, 329, 12068, 15417, 28491, 357, 32572, 52, 8, 290, 12068, 15417, 16588, 357, 32572, 38, 8, 351, 625, 3933, 10, 2181, 13363, 4981, 287, 1802, 10, 8950, 290, 2769, 48817, 1799, 1022, 449, 897, 11, 9485, 15884, 354, 290, 309, 22854, 37535, 13], [13246, 51, 318, 3562, 284, 662, 12, 27432, 2769, 8406, 4154, 282, 24612, 422, 9642, 9608, 276, 2420, 416, 26913, 21143, 319, 1111, 1364, 290, 826, 4732, 287, 477, 11685, 13], [464, 2068, 7586, 21831, 18045, 625, 262, 16931, 3290, 13]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]} # noqa: E501
# fmt: on
encoding_data = encoding.data
self.assertDictEqual(encoding_data, expected_encoding)
for expected, decoded in zip(sequences, decoded_sequences):
self.assertEqual(expected, decoded)