Moving fill-mask pipeline to new testing scheme (#12943)

* Fill mask pipelines test updates.

* Model eval !!

* Adding slow test with actual values.

* Making all tests pass (skipping quite a bit).

* Doc styling.

* Better doc cleanup.

* Making an explicit test with no pad token tokenizer.

* Typo.
Nicolas Patry 2021-08-13 12:04:18 +02:00, committed by GitHub
parent a04d4bf2d7
commit d58926ab1d
5 changed files with 367 additions and 304 deletions


@@ -748,6 +748,8 @@ class Pipeline(_ScikitCompat):
Parse arguments and tokenize
"""
# Parse arguments
+if self.tokenizer.pad_token is None:
+    padding = False
inputs = self.tokenizer(
inputs,
add_special_tokens=add_special_tokens,
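
Note on the two added lines above: some tokenizers (GPT-2's, for instance) define no pad token, and asking such a tokenizer to pad a batch raises a ValueError, so the pipeline falls back to unpadded encoding. A minimal sketch of the behavior (the checkpoint is illustrative):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # GPT-2 ships without a pad token
assert tokenizer.pad_token is None

# tokenizer(["short", "a much longer input"], padding=True)
# -> ValueError: asking to pad, but the tokenizer has no padding token

# Encoding each input on its own needs no padding at all:
encodings = [tokenizer(text) for text in ["short", "a much longer input"]]
```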


@@ -1,4 +1,4 @@
-from typing import TYPE_CHECKING, Optional, Union
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
import numpy as np
@@ -9,6 +9,8 @@ from ..utils import logging
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException
+GenericTensor = Union[List["GenericTensor"], "torch.Tensor", "tf.Tensor"]
if TYPE_CHECKING:
from ..modeling_tf_utils import TFPreTrainedModel
from ..modeling_utils import PreTrainedModel
@@ -30,7 +32,13 @@ logger = logging.get_logger(__name__)
@add_end_docstrings(
PIPELINE_INIT_ARGS,
r"""
-top_k (:obj:`int`, defaults to 5): The number of predictions to return.
+top_k (:obj:`int`, defaults to 5):
+    The number of predictions to return.
+targets (:obj:`str` or :obj:`List[str]`, `optional`):
+    When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+    vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
+    token will be used (with a warning, and that might be slower).
""",
)
class FillMaskPipeline(Pipeline):
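
For reference, a short usage sketch of the two options documented above (the checkpoint is illustrative):

```python
from transformers import pipeline

unmasker = pipeline("fill-mask", model="distilroberta-base", top_k=2)

# Score only the given candidates instead of the whole vocabulary:
unmasker("The largest city in France is <mask>.", targets=[" Paris", " Lyon"])
```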
@@ -59,6 +67,7 @@ class FillMaskPipeline(Pipeline):
args_parser: ArgumentHandler = None,
device: int = -1,
top_k=5,
+targets=None,
task: str = "",
):
super().__init__(
@@ -74,8 +83,23 @@
self.check_model_type(TF_MODEL_WITH_LM_HEAD_MAPPING if self.framework == "tf" else MODEL_FOR_MASKED_LM_MAPPING)
self.top_k = top_k
+self.targets = targets
+if self.tokenizer.mask_token_id is None:
+    raise PipelineException(
+        "fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`."
+    )
-def ensure_exactly_one_mask_token(self, masked_index: np.ndarray):
+def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray:
+    if self.framework == "tf":
+        masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()
+    elif self.framework == "pt":
+        masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
+    else:
+        raise ValueError("Unsupported framework")
+    return masked_index
+
+def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> np.ndarray:
+    masked_index = self.get_masked_index(input_ids)
numel = np.prod(masked_index.shape)
if numel > 1:
raise PipelineException(
@@ -90,7 +114,25 @@
f"No mask_token ({self.tokenizer.mask_token}) found on the input",
)
-def __call__(self, *args, targets=None, top_k: Optional[int] = None, **kwargs):
+def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor):
+    if isinstance(model_inputs, list):
+        for model_input in model_inputs:
+            self._ensure_exactly_one_mask_token(model_input["input_ids"][0])
+    else:
+        for input_ids in model_inputs["input_ids"]:
+            self._ensure_exactly_one_mask_token(input_ids)
+
+def get_model_inputs(self, inputs, *args, **kwargs) -> Dict:
+    if isinstance(inputs, list) and self.tokenizer.pad_token is None:
+        model_inputs = []
+        for input_ in inputs:
+            model_input = self._parse_and_tokenize(input_, padding=False, *args, **kwargs)
+            model_inputs.append(model_input)
+    else:
+        model_inputs = self._parse_and_tokenize(inputs, *args, **kwargs)
+    return model_inputs
+
+def __call__(self, inputs, *args, targets=None, top_k: Optional[int] = None, **kwargs):
"""
Fill the masked token in the text(s) given as inputs.
@@ -112,16 +154,27 @@
- **token** (:obj:`int`) -- The predicted token id (to replace the masked one).
- **token_str** (:obj:`str`) -- The predicted token (to replace the masked one).
"""
-inputs = self._parse_and_tokenize(*args, **kwargs)
-outputs = self._forward(inputs, return_tensors=True)
+model_inputs = self.get_model_inputs(inputs, *args, **kwargs)
+self.ensure_exactly_one_mask_token(model_inputs)
+
+if isinstance(model_inputs, list):
+    outputs = []
+    for model_input in model_inputs:
+        output = self._forward(model_input, return_tensors=True)
+        outputs.append(output)
+    batch_size = len(model_inputs)
+else:
+    outputs = self._forward(model_inputs, return_tensors=True)
+    batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)

# top_k must be defined
if top_k is None:
top_k = self.top_k

results = []
-batch_size = outputs.shape[0] if self.framework == "tf" else outputs.size(0)
+if targets is None and self.targets is not None:
+    targets = self.targets
if targets is not None:
if isinstance(targets, str):
targets = [targets]
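
The continuation of this hunk resolves each target string to a token id; a hedged sketch of that lookup (variable names are illustrative, and per the docstring above the real code also warns on out-of-vocab targets):

```python
vocab = self.tokenizer.get_vocab()
target_ids = []
for target in targets:
    if target in vocab:
        target_ids.append(vocab[target])
    else:
        # Not a single known token: tokenize it and keep the first sub-token.
        sub_ids = self.tokenizer(target, add_special_tokens=False)["input_ids"]
        target_ids.append(sub_ids[0])
```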
@@ -167,16 +220,21 @@
top_k = target_ids.shape[0]
for i in range(batch_size):
-input_ids = inputs["input_ids"][i]
+if isinstance(model_inputs, list):
+    input_ids = model_inputs[i]["input_ids"][0]
+else:
+    input_ids = model_inputs["input_ids"][i]
result = []
if self.framework == "tf":
masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()
-# Fill mask pipeline supports only one ${mask_token} per sample
-self.ensure_exactly_one_mask_token(masked_index)
-logits = outputs[i, masked_index.item(), :]
+if isinstance(outputs, list):
+    logits = outputs[i][0, masked_index.item(), :]
+else:
+    logits = outputs[i, masked_index.item(), :]
probs = tf.nn.softmax(logits)
if targets is not None:
probs = tf.gather_nd(probs, tf.reshape(target_ids, (-1, 1)))
@@ -185,11 +243,12 @@
values, predictions = topk.values.numpy(), topk.indices.numpy()
else:
masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
-# Fill mask pipeline supports only one ${mask_token} per sample
-self.ensure_exactly_one_mask_token(masked_index.numpy())
-logits = outputs[i, masked_index.item(), :]
+if isinstance(outputs, list):
+    logits = outputs[i][0, masked_index.item(), :]
+else:
+    logits = outputs[i, masked_index.item(), :]
probs = logits.softmax(dim=0)
if targets is not None:
probs = probs[..., target_ids]
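
The hunk cuts off just before the top-k selection; a self-contained illustration of the `topk` call the PyTorch branch ends with:

```python
import torch

# A distribution over candidate tokens at the masked position:
probs = torch.tensor([0.10, 0.60, 0.30])
values, predictions = probs.topk(2)
print(values)       # tensor([0.6000, 0.3000])
print(predictions)  # tensor([1, 2])
```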


@@ -189,6 +189,7 @@ class ReformerModelTester:
def get_pipeline_config(self):
config = self.get_config()
config.vocab_size = 100
+config.is_decoder = False
return config
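
Context for the added line: ReformerForMaskedLM refuses decoder configs (causal attention would hide the context around the mask), so the pipeline test config switches it off. A hedged sketch:

```python
from transformers import ReformerConfig, ReformerForMaskedLM

# is_decoder=True would trip Reformer's masked-LM check; the MLM head
# needs bi-directional self-attention.
config = ReformerConfig(is_decoder=False)
model = ReformerForMaskedLM(config)
```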
def create_and_check_reformer_model(self, config, input_ids, input_mask, choice_labels):


@@ -74,10 +74,10 @@ def get_tiny_config_from_class(configuration_class):
@lru_cache(maxsize=100)
def get_tiny_tokenizer_from_checkpoint(checkpoint):
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
-logger.warning("Training new from iterator ...")
+logger.info("Training new from iterator ...")
vocabulary = string.ascii_letters + string.digits + " "
tokenizer = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
-logger.warning("Trained.")
+logger.info("Trained.")
return tokenizer
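
Standalone version of the shrinking trick above: `train_new_from_iterator` accepts any iterator of texts, and iterating over a plain string yields one character per "text", which is plenty for pipeline smoke tests (fast tokenizers only; the checkpoint is illustrative):

```python
import string

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # must be a fast tokenizer
vocabulary = string.ascii_letters + string.digits + " "
tiny = tokenizer.train_new_from_iterator(vocabulary, vocab_size=len(vocabulary), show_progress=False)
print(tiny.vocab_size)  # a few dozen tokens instead of 50k+
```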
@@ -109,9 +109,7 @@ class PipelineTestCaseMeta(type):
# Some test tokenizers contain broken vocabs or a custom PreTokenizer, so we
# provide some default tokenizer and hope for the best.
except: # noqa: E722
-logger.warning(f"Tokenizer cannot be created from checkpoint {checkpoint}")
-tokenizer = get_tiny_tokenizer_from_checkpoint("gpt2")
-tokenizer.model_max_length = model.config.max_position_embeddings
+self.skipTest(f"Ignoring {ModelClass}, cannot create a simple tokenizer")
self.run_pipeline_test(model, tokenizer)
return test


@@ -14,304 +14,307 @@
import unittest

-from transformers import pipeline
-from transformers.testing_utils import nested_simplify, require_tf, require_torch, slow
+from transformers import MODEL_FOR_MASKED_LM_MAPPING, TF_MODEL_FOR_MASKED_LM_MAPPING, FillMaskPipeline, pipeline
+from transformers.pipelines import PipelineException
+from transformers.testing_utils import is_pipeline_test, nested_simplify, require_tf, require_torch, slow

-from .test_pipelines_common import MonoInputPipelineCommonMixin
+from .test_pipelines_common import ANY, PipelineTestCaseMeta
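
`ANY` and `PipelineTestCaseMeta` come from the shared pipeline test helpers; the metaclass generates one test per model/tokenizer pair from the mappings declared below, and the tests only rely on `ANY` comparing equal to any instance of a given type, roughly like this (a minimal sketch, not the helper's actual code):

```python
class ANY:
    def __init__(self, _type):
        self._type = _type

    def __eq__(self, other):
        # Matches any value of the wrapped type, so tests can pin the output
        # structure without hard-coding model-specific values.
        return isinstance(other, self._type)

assert {"score": 0.25, "token": 7} == {"score": ANY(float), "token": ANY(int)}
```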
EXPECTED_FILL_MASK_RESULT = [
[
{"sequence": "My name is John", "score": 0.00782308354973793, "token": 610, "token_str": " John"},
{"sequence": "My name is Chris", "score": 0.007475061342120171, "token": 1573, "token_str": " Chris"},
],
[
{
"sequence": "The largest city in France is Paris",
"score": 0.2510891854763031,
"token": 2201,
"token_str": " Paris",
},
{
"sequence": "The largest city in France is Lyon",
"score": 0.21418564021587372,
"token": 12790,
"token_str": " Lyon",
},
],
]
@is_pipeline_test
class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_MASKED_LM_MAPPING
tf_model_mapping = TF_MODEL_FOR_MASKED_LM_MAPPING
EXPECTED_FILL_MASK_TARGET_RESULT = [EXPECTED_FILL_MASK_RESULT[0]]
class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
pipeline_task = "fill-mask"
pipeline_loading_kwargs = {"top_k": 2}
small_models = ["sshleifer/tiny-distilroberta-base"] # Models tested without the @slow decorator
large_models = ["distilroberta-base"] # Models tested with the @slow decorator
mandatory_keys = {"sequence", "score", "token"}
valid_inputs = [
"My name is <mask>",
"The largest city in France is <mask>",
]
invalid_inputs = [
"This is <mask> <mask>" # More than 1 mask_token in the input is not supported
"This is" # No mask_token is not supported
]
expected_check_keys = ["sequence"]
@require_torch
def test_torch_fill_mask(self):
valid_inputs = "My name is <mask>"
unmasker = pipeline(task="fill-mask", model=self.small_models[0])
outputs = unmasker(valid_inputs)
self.assertIsInstance(outputs, list)
# This passes
outputs = unmasker(valid_inputs, targets=[" Patrick", " Clara"])
self.assertIsInstance(outputs, list)
# This used to fail with `cannot mix args and kwargs`
outputs = unmasker(valid_inputs, something=False)
self.assertIsInstance(outputs, list)
@require_torch
def test_torch_fill_mask_with_targets(self):
valid_inputs = ["My name is <mask>"]
# ' Sam' will yield a warning but work
valid_targets = [[" Teven", "ĠPatrick", "ĠClara"], ["ĠSam"], [" Sam"]]
invalid_targets = [[], [""], ""]
for model_name in self.small_models:
unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
for targets in valid_targets:
outputs = unmasker(valid_inputs, targets=targets)
self.assertIsInstance(outputs, list)
self.assertEqual(len(outputs), len(targets))
for targets in invalid_targets:
self.assertRaises(ValueError, unmasker, valid_inputs, targets=targets)
@require_torch
@slow
def test_torch_fill_mask_targets_equivalence(self):
model_name = self.large_models[0]
unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
unmasked = unmasker(self.valid_inputs[0])
tokens = [top_mask["token_str"] for top_mask in unmasked]
scores = [top_mask["score"] for top_mask in unmasked]
unmasked_targets = unmasker(self.valid_inputs[0], targets=tokens)
target_scores = [top_mask["score"] for top_mask in unmasked_targets]
self.assertEqual(scores, target_scores)
@require_torch
def test_torch_fill_mask_with_targets_and_topk(self):
model_name = self.small_models[0]
unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
targets = [" Teven", "ĠPatrick", "ĠClara"]
top_k = 2
outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)
@require_tf
def test_small_model_tf(self):
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2, framework="tf")
outputs = unmasker("My name is <mask>")
self.assertEqual(
-nested_simplify(outputs),
+nested_simplify(outputs, decimals=6),
[
{"sequence": "My name is Patrick", "score": 0.0, "token": 3499, "token_str": " Patrick"},
{"sequence": "My name is Te", "score": 0.0, "token": 2941, "token_str": " Te"},
{"sequence": "My name is grouped", "score": 2.1e-05, "token": 38015, "token_str": " grouped"},
{"sequence": "My name is accuser", "score": 2.1e-05, "token": 25506, "token_str": " accuser"},
],
)
outputs = unmasker("The largest city in France is <mask>")
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{
"sequence": "The largest city in France is grouped",
"score": 2.1e-05,
"token": 38015,
"token_str": " grouped",
},
{
"sequence": "The largest city in France is accuser",
"score": 2.1e-05,
"token": 25506,
"token_str": " accuser",
},
],
)
outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3)
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{"sequence": "My name is Clara", "score": 2e-05, "token": 13606, "token_str": " Clara"},
{"sequence": "My name is Patrick", "score": 2e-05, "token": 3499, "token_str": " Patrick"},
{"sequence": "My name is Te", "score": 1.9e-05, "token": 2941, "token_str": " Te"},
],
)
@require_torch
-def test_torch_fill_mask_with_duplicate_targets_and_topk(self):
-    model_name = self.small_models[0]
-    unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
+def test_small_model_pt(self):
+    unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", top_k=2, framework="pt")
outputs = unmasker("My name is <mask>")
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{"sequence": "My name is Maul", "score": 2.2e-05, "token": 35676, "token_str": " Maul"},
{"sequence": "My name isELS", "score": 2.2e-05, "token": 16416, "token_str": "ELS"},
],
)
outputs = unmasker("The largest city in France is <mask>")
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{
"sequence": "The largest city in France is Maul",
"score": 2.2e-05,
"token": 35676,
"token_str": " Maul",
},
{"sequence": "The largest city in France isELS", "score": 2.2e-05, "token": 16416, "token_str": "ELS"},
],
)
outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3)
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
{"sequence": "My name is Patrick", "score": 2.1e-05, "token": 3499, "token_str": " Patrick"},
{"sequence": "My name is Te", "score": 2e-05, "token": 2941, "token_str": " Te"},
{"sequence": "My name is Clara", "score": 2e-05, "token": 13606, "token_str": " Clara"},
],
)
@slow
@require_torch
def test_large_model_pt(self):
unmasker = pipeline(task="fill-mask", model="distilroberta-base", top_k=2, framework="pt")
self.run_large_test(unmasker)
@slow
@require_tf
def test_large_model_tf(self):
unmasker = pipeline(task="fill-mask", model="distilroberta-base", top_k=2, framework="tf")
self.run_large_test(unmasker)
def run_large_test(self, unmasker):
outputs = unmasker("My name is <mask>")
self.assertEqual(
nested_simplify(outputs),
[
{"sequence": "My name is John", "score": 0.008, "token": 610, "token_str": " John"},
{"sequence": "My name is Chris", "score": 0.007, "token": 1573, "token_str": " Chris"},
],
)
outputs = unmasker("The largest city in France is <mask>")
self.assertEqual(
nested_simplify(outputs),
[
{
"sequence": "The largest city in France is Paris",
"score": 0.251,
"token": 2201,
"token_str": " Paris",
},
{
"sequence": "The largest city in France is Lyon",
"score": 0.214,
"token": 12790,
"token_str": " Lyon",
},
],
)
outputs = unmasker("My name is <mask>", targets=[" Patrick", " Clara", " Teven"], top_k=3)
self.assertEqual(
nested_simplify(outputs),
[
{"sequence": "My name is Patrick", "score": 0.005, "token": 3499, "token_str": " Patrick"},
{"sequence": "My name is Clara", "score": 0.000, "token": 13606, "token_str": " Clara"},
{"sequence": "My name is Te", "score": 0.000, "token": 2941, "token_str": " Te"},
],
)
@require_torch
def test_model_no_pad_pt(self):
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="pt")
unmasker.tokenizer.pad_token_id = None
unmasker.tokenizer.pad_token = None
self.run_pipeline_test(unmasker.model, unmasker.tokenizer)
@require_tf
def test_model_no_pad_tf(self):
unmasker = pipeline(task="fill-mask", model="sshleifer/tiny-distilroberta-base", framework="tf")
unmasker.tokenizer.pad_token_id = None
unmasker.tokenizer.pad_token = None
self.run_pipeline_test(unmasker.model, unmasker.tokenizer)
def run_pipeline_test(self, model, tokenizer):
if tokenizer.mask_token_id is None:
self.skipTest("The provided tokenizer has no mask token, (probably reformer)")
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
outputs = fill_masker(f"This is a {tokenizer.mask_token}")
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
)
outputs = fill_masker([f"This is a {tokenizer.mask_token}", f"Another {tokenizer.mask_token}"])
self.assertEqual(
outputs,
[
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
],
)
with self.assertRaises(ValueError):
fill_masker([None])
# Multiple masks
with self.assertRaises(PipelineException):
fill_masker(f"This is {tokenizer.mask_token} {tokenizer.mask_token}")
# No mask_token is not supported
with self.assertRaises(PipelineException):
fill_masker("This is")
self.run_test_top_k(model, tokenizer)
self.run_test_targets(model, tokenizer)
self.run_test_top_k_targets(model, tokenizer)
self.fill_mask_with_duplicate_targets_and_top_k(model, tokenizer)
def run_test_targets(self, model, tokenizer):
vocab = tokenizer.get_vocab()
targets = list(sorted(vocab.keys()))[:2]
# Pipeline argument
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, targets=targets)
outputs = fill_masker(f"This is a {tokenizer.mask_token}")
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
)
target_ids = {vocab[el] for el in targets}
self.assertEqual(set(el["token"] for el in outputs), target_ids)
self.assertEqual(set(el["token_str"] for el in outputs), set(targets))
# Call argument
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets)
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
)
target_ids = {vocab[el] for el in targets}
self.assertEqual(set(el["token"] for el in outputs), target_ids)
self.assertEqual(set(el["token_str"] for el in outputs), set(targets))
# Score equivalence
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets)
tokens = [top_mask["token_str"] for top_mask in outputs]
scores = [top_mask["score"] for top_mask in outputs]
unmasked_targets = fill_masker(f"This is a {tokenizer.mask_token}", targets=tokens)
target_scores = [top_mask["score"] for top_mask in unmasked_targets]
self.assertEqual(nested_simplify(scores), nested_simplify(target_scores))
# Raises with invalid
with self.assertRaises(ValueError):
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[""])
with self.assertRaises(ValueError):
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[])
with self.assertRaises(ValueError):
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets="")
def run_test_top_k(self, model, tokenizer):
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, top_k=2)
outputs = fill_masker(f"This is a {tokenizer.mask_token}")
self.assertEqual(
outputs,
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
)
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=2)
self.assertEqual(
outputs2,
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
)
self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2))
def run_test_top_k_targets(self, model, tokenizer):
vocab = tokenizer.get_vocab()
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
# top_k=2, ntargets=3
targets = list(sorted(vocab.keys()))[:3]
outputs = fill_masker(f"This is a {tokenizer.mask_token}", top_k=2, targets=targets)
# If we use the most probable targets, and filter differently, we should still
# have the same results
targets2 = [el["token_str"] for el in sorted(outputs, key=lambda x: x["score"], reverse=True)]
outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=3, targets=targets2)
# They should yield exactly the same result
self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2))
def fill_mask_with_duplicate_targets_and_top_k(self, model, tokenizer):
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
vocab = tokenizer.get_vocab()
# String duplicates + id duplicates
targets = [" Teven", "ĠPatrick", "ĠClara", "ĠClara", " Clara"]
top_k = 10
outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)
targets = list(sorted(vocab.keys()))[:3]
targets = [targets[0], targets[1], targets[0], targets[2], targets[1]]
outputs = fill_masker(f"My name is {tokenizer.mask_token}", targets=targets, top_k=10)
# The target list contains duplicates, so we can't return more
# predictions than the number of unique targets.
self.assertEqual(len(outputs), 3)
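
The assertion above relies on duplicate targets collapsing to unique token ids before scoring; a hedged sketch of that invariant (not the pipeline's actual code):

```python
# Five requested targets, three unique ids -> at most three predictions.
target_ids = [3, 7, 3, 9, 7]
unique_ids = list(dict.fromkeys(target_ids))  # order-preserving dedup
assert len(unique_ids) == 3
```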
@require_tf
def test_tf_fill_mask_with_targets(self):
valid_inputs = ["My name is <mask>"]
# ' Teven' will yield a warning but work as " Te"
invalid_targets = [[], [""], ""]
unmasker = pipeline(
task="fill-mask", model=self.small_models[0], tokenizer=self.small_models[0], framework="tf"
)
outputs = unmasker(valid_inputs, targets=[" Teven", "ĠPatrick", "ĠClara"])
self.assertEqual(
nested_simplify(outputs),
[
{"sequence": "My name is Clara", "score": 0.0, "token": 13606, "token_str": " Clara"},
{"sequence": "My name is Patrick", "score": 0.0, "token": 3499, "token_str": " Patrick"},
{"sequence": "My name is Te", "score": 0.0, "token": 2941, "token_str": " Te"},
],
)
# topk
outputs = unmasker(valid_inputs, targets=[" Teven", "ĠPatrick", "ĠClara"], top_k=2)
self.assertEqual(
nested_simplify(outputs),
[
{"sequence": "My name is Clara", "score": 0.0, "token": 13606, "token_str": " Clara"},
{"sequence": "My name is Patrick", "score": 0.0, "token": 3499, "token_str": " Patrick"},
],
)
for targets in invalid_targets:
with self.assertRaises(ValueError):
unmasker(valid_inputs, targets=targets)
@require_torch
@slow
def test_torch_fill_mask_results(self):
mandatory_keys = {"sequence", "score", "token"}
valid_inputs = [
"My name is <mask>",
"The largest city in France is <mask>",
]
valid_targets = ["ĠPatrick", "ĠClara"]
for model_name in self.large_models:
unmasker = pipeline(
task="fill-mask",
model=model_name,
tokenizer=model_name,
framework="pt",
top_k=2,
)
mono_result = unmasker(valid_inputs[0], targets=valid_targets)
self.assertIsInstance(mono_result, list)
self.assertIsInstance(mono_result[0], dict)
for mandatory_key in mandatory_keys:
self.assertIn(mandatory_key, mono_result[0])
multi_result = [unmasker(valid_input) for valid_input in valid_inputs]
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], (dict, list))
for result, expected in zip(multi_result, EXPECTED_FILL_MASK_RESULT):
for r, e in zip(result, expected):
self.assertEqual(r["sequence"], e["sequence"])
self.assertEqual(r["token_str"], e["token_str"])
self.assertEqual(r["token"], e["token"])
self.assertAlmostEqual(r["score"], e["score"], places=3)
if isinstance(multi_result[0], list):
multi_result = multi_result[0]
for result in multi_result:
for key in mandatory_keys:
self.assertIn(key, result)
self.assertRaises(Exception, unmasker, [None])
valid_inputs = valid_inputs[:1]
mono_result = unmasker(valid_inputs[0], targets=valid_targets)
self.assertIsInstance(mono_result, list)
self.assertIsInstance(mono_result[0], dict)
for mandatory_key in mandatory_keys:
self.assertIn(mandatory_key, mono_result[0])
multi_result = [unmasker(valid_input) for valid_input in valid_inputs]
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], (dict, list))
for result, expected in zip(multi_result, EXPECTED_FILL_MASK_TARGET_RESULT):
for r, e in zip(result, expected):
self.assertEqual(r["sequence"], e["sequence"])
self.assertEqual(r["token_str"], e["token_str"])
self.assertEqual(r["token"], e["token"])
self.assertAlmostEqual(r["score"], e["score"], places=3)
if isinstance(multi_result[0], list):
multi_result = multi_result[0]
for result in multi_result:
for key in mandatory_keys:
self.assertIn(key, result)
self.assertRaises(Exception, unmasker, [None])
@require_tf
@slow
def test_tf_fill_mask_results(self):
mandatory_keys = {"sequence", "score", "token"}
valid_inputs = [
"My name is <mask>",
"The largest city in France is <mask>",
]
valid_targets = ["ĠPatrick", "ĠClara"]
for model_name in self.large_models:
unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf", top_k=2)
mono_result = unmasker(valid_inputs[0], targets=valid_targets)
self.assertIsInstance(mono_result, list)
self.assertIsInstance(mono_result[0], dict)
for mandatory_key in mandatory_keys:
self.assertIn(mandatory_key, mono_result[0])
multi_result = [unmasker(valid_input) for valid_input in valid_inputs]
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], (dict, list))
for result, expected in zip(multi_result, EXPECTED_FILL_MASK_RESULT):
for r, e in zip(result, expected):
self.assertEqual(r["sequence"], e["sequence"])
self.assertEqual(r["token_str"], e["token_str"])
self.assertEqual(r["token"], e["token"])
self.assertAlmostEqual(r["score"], e["score"], places=3)
if isinstance(multi_result[0], list):
multi_result = multi_result[0]
for result in multi_result:
for key in mandatory_keys:
self.assertIn(key, result)
self.assertRaises(Exception, unmasker, [None])
valid_inputs = valid_inputs[:1]
mono_result = unmasker(valid_inputs[0], targets=valid_targets)
self.assertIsInstance(mono_result, list)
self.assertIsInstance(mono_result[0], dict)
for mandatory_key in mandatory_keys:
self.assertIn(mandatory_key, mono_result[0])
multi_result = [unmasker(valid_input) for valid_input in valid_inputs]
self.assertIsInstance(multi_result, list)
self.assertIsInstance(multi_result[0], (dict, list))
for result, expected in zip(multi_result, EXPECTED_FILL_MASK_TARGET_RESULT):
for r, e in zip(result, expected):
self.assertEqual(r["sequence"], e["sequence"])
self.assertEqual(r["token_str"], e["token_str"])
self.assertEqual(r["token"], e["token"])
self.assertAlmostEqual(r["score"], e["score"], places=3)
if isinstance(multi_result[0], list):
multi_result = multi_result[0]
for result in multi_result:
for key in mandatory_keys:
self.assertIn(key, result)
self.assertRaises(Exception, unmasker, [None])
@require_tf
@slow
def test_tf_fill_mask_targets_equivalence(self):
model_name = self.large_models[0]
unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf")
unmasked = unmasker(self.valid_inputs[0])
tokens = [top_mask["token_str"] for top_mask in unmasked]
scores = [top_mask["score"] for top_mask in unmasked]
unmasked_targets = unmasker(self.valid_inputs[0], targets=tokens)
target_scores = [top_mask["score"] for top_mask in unmasked_targets]
self.assertEqual(scores, target_scores)