[cleanup/marian] pipelines test and new kwarg (#4812)

2020-06-05 18:45:19 -04:00 · 2020-06-05 18:45:19 -04:00 · 4ab7424597
parent 875288b344
commit 4ab7424597
3 changed files with 12 additions and 6 deletions
--- a/src/transformers/tokenization_marian.py
+++ b/src/transformers/tokenization_marian.py
@@ -48,13 +48,12 @@ class MarianTokenizer(PreTrainedTokenizer):
        unk_token="<unk>",
        eos_token="</s>",
        pad_token="<pad>",
-        max_len=512,
-        **kwargs,
+        model_max_length=512,
+        **kwargs
    ):
-
        super().__init__(
            # bos_token=bos_token,  unused. Start decoding with config.decoder_start_token_id
-            max_len=max_len,
+            model_max_length=model_max_length,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
--- a/tests/test_modeling_marian.py
+++ b/tests/test_modeling_marian.py
@@ -38,6 +38,7 @@ if is_torch_available():
        convert_opus_name_to_hf_name,
        ORG_NAME,
    )
+    from transformers.pipelines import TranslationPipeline


 class ModelManagementTests(unittest.TestCase):
@@ -189,6 +190,7 @@ class TestMarian_RU_FR(MarianIntegrationTest):
    src_text = ["Он показал мне рукопись своей новой пьесы."]
    expected_text = ["Il m'a montré le manuscrit de sa nouvelle pièce."]

+    @slow
    def test_batch_generation_ru_fr(self):
        self._assert_generated_batch_equal_expected()

@@ -199,6 +201,7 @@ class TestMarian_MT_EN(MarianIntegrationTest):
    src_text = ["Billi messu b'mod ġentili, Ġesù fejjaq raġel li kien milqut bil - marda kerha tal - ġdiem."]
    expected_text = ["Touching gently, Jesus healed a man who was affected by the sad disease of leprosy."]

+    @slow
    def test_batch_generation_mt_en(self):
        self._assert_generated_batch_equal_expected()

@@ -229,6 +232,11 @@ class TestMarian_en_ROMANCE(MarianIntegrationTest):
        with self.assertRaises(ValueError):
            self.tokenizer.prepare_translation_batch([""])

+    def test_pipeline(self):
+        pipeline = TranslationPipeline(self.model, self.tokenizer, framework="pt")
+        output = pipeline(self.src_text)
+        self.assertEqual(self.expected_text, [x["translation_text"] for x in output])
+

@require_torch
 class TestConversionUtils(unittest.TestCase):
--- a/tests/test_tokenization_marian.py
+++ b/tests/test_tokenization_marian.py
@@ -52,8 +52,7 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        tokenizer.save_pretrained(self.tmpdirname)

    def get_tokenizer(self, max_len=None, **kwargs) -> MarianTokenizer:
-        # overwrite max_len=512 default
-        return MarianTokenizer.from_pretrained(self.tmpdirname, max_len=max_len, **kwargs)
+        return MarianTokenizer.from_pretrained(self.tmpdirname, model_max_length=max_len, **kwargs)

    def get_input_output_texts(self):
        return (