Implement multiple span support for DocumentQuestionAnswering (#19204)
* Implement multiple span support
* Address comments
* Add tests + fix bugs
This commit is contained in:
parent ab856f68df
commit a3008c5a6d
@@ -25,7 +25,7 @@ from ..utils import (
     is_vision_available,
     logging,
 )
-from .base import PIPELINE_INIT_ARGS, Pipeline
+from .base import PIPELINE_INIT_ARGS, ChunkPipeline
 from .question_answering import select_starts_ends

@@ -49,7 +49,7 @@ logger = logging.get_logger(__name__)

 # normalize_bbox() and apply_tesseract() are derived from apply_tesseract in models/layoutlmv3/feature_extraction_layoutlmv3.py.
 # However, because the pipeline may evolve from what layoutlmv3 currently does, it's copied (vs. imported) to avoid creating an
-# unecessary dependency.
+# unnecessary dependency.
 def normalize_box(box, width, height):
     return [
         int(1000 * (box[0] / width)),
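For reference, `normalize_box` rescales pixel boxes into the 0-1000 coordinate grid that LayoutLM-family models expect. A small worked example; the function body below is completed from the pattern the hunk shows and is not itself part of the diff:

```python
def normalize_box(box, width, height):
    # Scale (x0, y0, x1, y1) pixel coordinates into the 0-1000 grid
    # that LayoutLM-style models expect.
    return [
        int(1000 * (box[0] / width)),
        int(1000 * (box[1] / height)),
        int(1000 * (box[2] / width)),
        int(1000 * (box[3] / height)),
    ]

print(normalize_box([15, 30, 45, 60], width=300, height=600))  # [50, 50, 150, 100]
```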
@@ -99,7 +99,7 @@ class ModelType(ExplicitEnum):


 @add_end_docstrings(PIPELINE_INIT_ARGS)
-class DocumentQuestionAnsweringPipeline(Pipeline):
+class DocumentQuestionAnsweringPipeline(ChunkPipeline):
     # TODO: Update task_summary docs to include an example with document QA and then update the first sentence
     """
     Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. The inputs/outputs are
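The switch from `Pipeline` to `ChunkPipeline` is the heart of this PR. Below is a minimal sketch of the contract, assuming the usual ChunkPipeline behavior (illustrative, not the transformers source): `preprocess` becomes a generator that yields one model-ready dict per chunk, `_forward` runs once per chunk, and `postprocess` receives the list of all per-chunk outputs:

```python
class MiniChunkPipeline:
    """Toy stand-in for the ChunkPipeline control flow assumed here."""

    def preprocess(self, inputs):
        raise NotImplementedError  # must be a generator yielding chunk dicts

    def _forward(self, model_inputs):
        raise NotImplementedError  # called once per yielded chunk

    def postprocess(self, all_outputs, **kwargs):
        raise NotImplementedError  # sees the list of every chunk's output

    def run_single(self, inputs, **postprocess_params):
        # Accumulate one model output per chunk, then postprocess them together.
        all_outputs = [self._forward(chunk) for chunk in self.preprocess(inputs)]
        return self.postprocess(all_outputs, **postprocess_params)
```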
@@ -234,6 +234,8 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
             - **end** (`int`) -- The end word index of the answer (in the OCR'd version of the input or provided
               `word_boxes`).
             - **answer** (`str`) -- The answer to the question.
+            - **words** (`list[int]`) -- The index of each word/box pair that is in the answer
+            - **page** (`int`) -- The page of the answer
         """
         if isinstance(question, str):
             inputs = {"question": question, "image": image}
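Per the `__call__` branch above and the tests added in this PR, the pipeline accepts three call shapes. A hedged usage sketch; `"invoice.png"` is a placeholder path, not a file from the repo:

```python
from transformers import pipeline

dqa = pipeline("document-question-answering", model="impira/layoutlm-document-qa")

# Keyword form, a single dict, or a list of dicts (batched):
dqa(image="invoice.png", question="What is the invoice number?")
dqa({"image": "invoice.png", "question": "What is the invoice number?"})
dqa([{"image": "invoice.png", "question": "What is the invoice number?"}] * 2)
```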
@@ -243,7 +245,24 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
             inputs = image
         return super().__call__(inputs, **kwargs)

-    def preprocess(self, input, lang=None, tesseract_config=""):
+    def preprocess(
+        self,
+        input,
+        padding="do_not_pad",
+        doc_stride=None,
+        max_seq_len=None,
+        word_boxes: Tuple[str, List[float]] = None,
+        lang=None,
+        tesseract_config="",
+    ):
+        # NOTE: This code mirrors the code in question answering and will be implemented in a follow up PR
+        # to support documents with enough tokens that overflow the model's window
+        if max_seq_len is None:
+            max_seq_len = self.tokenizer.model_max_length
+
+        if doc_stride is None:
+            doc_stride = min(max_seq_len // 2, 256)
+
         image = None
         image_features = {}
         if input.get("image", None) is not None:
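With these defaults, a model limited to `max_seq_len` tokens walks the document in overlapping windows: each step advances `max_seq_len - doc_stride` tokens. A standalone sketch of that arithmetic (hypothetical helper, ignoring the room the real tokenizer reserves for the question and special tokens):

```python
def span_starts(n_tokens, max_seq_len, doc_stride):
    # Window start offsets: each window holds max_seq_len tokens and
    # consecutive windows overlap by doc_stride tokens.
    starts, pos = [], 0
    while True:
        starts.append(pos)
        if pos + max_seq_len >= n_tokens:
            break
        pos += max_seq_len - doc_stride
    return starts

# max_seq_len=50 gives doc_stride = min(50 // 2, 256) = 25, so windows
# advance 25 tokens at a time over a 120-token document:
print(span_starts(n_tokens=120, max_seq_len=50, doc_stride=25))  # [0, 25, 50, 75]
```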
@@ -291,9 +310,15 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
                 ).input_ids,
                 "return_dict_in_generate": True,
             }
-            p_mask = None
-            word_ids = None
-            words = None
+            yield {
+                **encoding,
+                "p_mask": None,
+                "word_ids": None,
+                "words": None,
+                "page": None,
+                "output_attentions": True,
+                "is_last": True,
+            }
         else:
             tokenizer_kwargs = {}
             if self.model_type == ModelType.LayoutLM:
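Every yielded chunk carries an `is_last` flag so the consuming pipeline can tell where one document's chunks end. A toy illustration of regrouping a flat chunk stream by that flag (illustrative, not the ChunkPipeline internals):

```python
def regroup(chunks):
    # Collect chunks until is_last fires, then start a new document.
    docs, current = [], []
    for chunk in chunks:
        current.append(chunk)
        if chunk["is_last"]:
            docs.append(current)
            current = []
    return docs

stream = [
    {"span": 0, "is_last": False},
    {"span": 1, "is_last": True},   # end of document 1
    {"span": 0, "is_last": True},   # document 2 fits in one span
]
print(regroup(stream))  # two documents: [[span 0, span 1], [span 0]]
```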
@@ -306,21 +331,15 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
                 tokenizer_kwargs["boxes"] = [boxes]

             encoding = self.tokenizer(
+                padding=padding,
+                max_length=max_seq_len,
+                stride=doc_stride,
                 return_token_type_ids=True,
-                return_tensors=self.framework,
-                # TODO: In a future PR, use these feature to handle sequences whose length is longer than
-                # the maximum allowed by the model. Currently, the tokenizer will produce a sequence that
-                # may be too long for the model to handle.
-                # truncation="only_second",
-                # return_overflowing_tokens=True,
+                truncation="only_second",
+                return_overflowing_tokens=True,
                 **tokenizer_kwargs,
             )

-            if "pixel_values" in image_features:
-                encoding["image"] = image_features.pop("pixel_values")
-
-            # TODO: For now, this should always be num_spans == 1 given the flags we've passed in above, but the
-            # code is written to naturally handle multiple spans at the right time.
             num_spans = len(encoding["input_ids"])

             # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
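The newly enabled `truncation="only_second"` and `return_overflowing_tokens=True` make the tokenizer emit one row of `input_ids` per overlapping span instead of one overlong sequence. A self-contained illustration; `bert-base-uncased` is an arbitrary stand-in, not the pipeline's tokenizer:

```python
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
enc = tok(
    "What is the invoice number?",
    "a very long document body " * 40,
    max_length=32,
    stride=16,
    truncation="only_second",       # only the document side gets windowed
    return_overflowing_tokens=True,  # emit every window, not just the first
)
# One row of input_ids per overlapping span; this is num_spans in the diff.
print(len(enc["input_ids"]))
```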
@@ -328,6 +347,13 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
             # This logic mirrors the logic in the question_answering pipeline
             p_mask = [[tok != 1 for tok in encoding.sequence_ids(span_id)] for span_id in range(num_spans)]
             for span_idx in range(num_spans):
+                if self.framework == "pt":
+                    span_encoding = {k: torch.tensor(v[span_idx : span_idx + 1]) for (k, v) in encoding.items()}
+                    if "pixel_values" in image_features:
+                        span_encoding["image"] = image_features["pixel_values"]
+                else:
+                    raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")
+
                 input_ids_span_idx = encoding["input_ids"][span_idx]
                 # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
                 if self.tokenizer.cls_token_id is not None:
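Each span is then sliced out of the batch encoding one row at a time, keeping a batch dimension of 1. The same pattern on toy values:

```python
import torch

# Take row span_idx of every tokenizer output while keeping a batch dim of 1.
encoding = {
    "input_ids": [[101, 7, 8, 102], [101, 9, 10, 102]],
    "attention_mask": [[1, 1, 1, 1], [1, 1, 1, 1]],
}
span_idx = 1
span_encoding = {k: torch.tensor(v[span_idx : span_idx + 1]) for k, v in encoding.items()}
print(span_encoding["input_ids"].shape)  # torch.Size([1, 4])
```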
@@ -339,11 +365,10 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
                 # for SEP tokens, and the word's bounding box for words in the original document.
                 if "boxes" not in tokenizer_kwargs:
                     bbox = []
-                    for batch_index in range(num_spans):
-                        for input_id, sequence_id, word_id in zip(
-                            encoding.input_ids[batch_index],
-                            encoding.sequence_ids(batch_index),
-                            encoding.word_ids(batch_index),
+                    for input_id, sequence_id, word_id in zip(
+                        encoding.input_ids[span_idx],
+                        encoding.sequence_ids(span_idx),
+                        encoding.word_ids(span_idx),
                     ):
                         if sequence_id == 1:
                             bbox.append(boxes[word_id])
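For LayoutLM-style models, every token in a span needs a bounding box: document tokens (sequence id 1) get their word's box, everything else a zero box (the surrounding code also gives SEP tokens `[1000] * 4`, elided here). A toy version of that assembly:

```python
boxes = [[10, 10, 50, 20], [60, 10, 90, 20]]     # one box per OCR'd word
sequence_ids = [None, 0, 0, None, 1, 1, None]    # from encoding.sequence_ids(span_idx)
word_ids = [None, None, None, None, 0, 1, None]  # from encoding.word_ids(span_idx)

# Document tokens get their word's box; question/special tokens get zeros.
bbox = [boxes[w] if s == 1 else [0] * 4 for s, w in zip(sequence_ids, word_ids)]
print(bbox)
```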
@@ -352,41 +377,50 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
                         else:
                             bbox.append([0] * 4)

-            if self.framework == "tf":
-                raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")
-            elif self.framework == "pt":
-                encoding["bbox"] = torch.tensor([bbox])
-
-            word_ids = [encoding.word_ids(i) for i in range(num_spans)]
-
-        return {**encoding, "p_mask": p_mask, "word_ids": word_ids, "words": words}
+                if self.framework == "pt":
+                    span_encoding["bbox"] = torch.tensor(bbox).unsqueeze(0)
+                elif self.framework == "tf":
+                    raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")
+
+                yield {
+                    **span_encoding,
+                    "p_mask": p_mask[span_idx],
+                    "word_ids": encoding.word_ids(span_idx),
+                    "words": words,
+                    "is_last": span_idx == num_spans - 1,
+                }

     def _forward(self, model_inputs):
         p_mask = model_inputs.pop("p_mask", None)
         word_ids = model_inputs.pop("word_ids", None)
         words = model_inputs.pop("words", None)
+        is_last = model_inputs.pop("is_last", False)
+
+        if "overflow_to_sample_mapping" in model_inputs:
+            model_inputs.pop("overflow_to_sample_mapping")

         if self.model_type == ModelType.VisionEncoderDecoder:
             model_outputs = self.model.generate(**model_inputs)
         else:
             model_outputs = self.model(**model_inputs)

+        model_outputs = {k: v for (k, v) in model_outputs.items()}
         model_outputs["p_mask"] = p_mask
         model_outputs["word_ids"] = word_ids
         model_outputs["words"] = words
         model_outputs["attention_mask"] = model_inputs.get("attention_mask", None)
+        model_outputs["is_last"] = is_last
         return model_outputs

     def postprocess(self, model_outputs, top_k=1, **kwargs):
         if self.model_type == ModelType.VisionEncoderDecoder:
-            answers = self.postprocess_donut(model_outputs)
+            answers = [self.postprocess_encoder_decoder_single(o) for o in model_outputs]
         else:
             answers = self.postprocess_extractive_qa(model_outputs, top_k=top_k, **kwargs)
+
+        answers = sorted(answers, key=lambda x: x.get("score", 0), reverse=True)[:top_k]
         return answers

-    def postprocess_donut(self, model_outputs, **kwargs):
+    def postprocess_encoder_decoder_single(self, model_outputs, **kwargs):
         sequence = self.tokenizer.batch_decode(model_outputs.sequences)[0]

         # TODO: A lot of this logic is specific to Donut and should probably be handled in the tokenizer
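Because `postprocess` now sees one output per span, answers from all spans are pooled and a single global sort keeps the best `top_k`. Illustrative values:

```python
answers = [
    {"score": 0.42, "answer": "1102/2019", "start": 22, "end": 22},  # from span 0
    {"score": 0.99, "answer": "us-001", "start": 15, "end": 15},     # from span 1
]
top_k = 1
# Candidates from every span compete; only the best top_k survive.
print(sorted(answers, key=lambda x: x.get("score", 0), reverse=True)[:top_k])
```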
@@ -400,36 +434,35 @@ class DocumentQuestionAnsweringPipeline(Pipeline):
         answer = re.search(r"<s_answer>(.*)</s_answer>", sequence)
         if answer is not None:
             ret["answer"] = answer.group(1).strip()
-        return [ret]
+        return ret

     def postprocess_extractive_qa(
         self, model_outputs, top_k=1, handle_impossible_answer=False, max_answer_len=15, **kwargs
     ):
         min_null_score = 1000000  # large and positive
         answers = []
-        words = model_outputs["words"]
+        for output in model_outputs:
+            words = output["words"]

-        # TODO: Currently, we expect the length of model_outputs to be 1, because we do not stride
-        # in the preprocessor code. When we implement that, we'll either need to handle tensors of size
-        # > 1 or use the ChunkPipeline and handle multiple outputs (each of size = 1).
-        starts, ends, scores, min_null_score = select_starts_ends(
-            model_outputs["start_logits"],
-            model_outputs["end_logits"],
-            model_outputs["p_mask"],
-            model_outputs["attention_mask"].numpy() if model_outputs.get("attention_mask", None) is not None else None,
-            min_null_score,
-            top_k,
-            handle_impossible_answer,
-            max_answer_len,
-        )
+            starts, ends, scores, min_null_score = select_starts_ends(
+                start=output["start_logits"],
+                end=output["end_logits"],
+                p_mask=output["p_mask"],
+                attention_mask=output["attention_mask"].numpy()
+                if output.get("attention_mask", None) is not None
+                else None,
+                min_null_score=min_null_score,
+                top_k=top_k,
+                handle_impossible_answer=handle_impossible_answer,
+                max_answer_len=max_answer_len,
+            )

-        word_ids = model_outputs["word_ids"][0]
-        for start, eend, score in zip(starts, ends, scores):
-            word_start, word_end = word_ids[start], word_ids[eend]
+            word_ids = output["word_ids"]
+            for start, end, score in zip(starts, ends, scores):
+                word_start, word_end = word_ids[start], word_ids[end]
                 if word_start is not None and word_end is not None:
                     answers.append(
                         {
-                            "score": float(score),  # XXX Write a test that verifies the result is JSON-serializable
+                            "score": float(score),
                             "answer": " ".join(words[word_start : word_end + 1]),
                             "start": word_start,
                             "end": word_end,
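`min_null_score` is threaded through the span loop because `select_starts_ends` both consumes and returns it; the "no answer" score must be the minimum seen across all spans, not just the last one. A sketch with stand-in numbers:

```python
min_null_score = 1000000  # large and positive, as in the code above
for span_null_score in [3.2, 1.7, 2.9]:  # one null score per span (stand-ins)
    min_null_score = min(min_null_score, span_null_score)
print(min_null_score)  # 1.7
```

The remaining hunks come from the pipeline's test suite.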
@@ -191,6 +191,52 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=Pipeli
             * 2,
         )

+    @slow
+    @require_torch
+    @require_detectron2
+    @require_pytesseract
+    def test_large_model_pt_chunk(self):
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa",
+            revision="9977165",
+            max_seq_len=50,
+        )
+        image = INVOICE_URL
+        question = "What is the invoice number?"
+
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9967, "answer": "1102/2019", "start": 22, "end": 22},
+                {"score": 0.996, "answer": "us-001", "start": 15, "end": 15},
+            ],
+        )
+
+        outputs = dqa_pipeline({"image": image, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9967, "answer": "1102/2019", "start": 22, "end": 22},
+                {"score": 0.996, "answer": "us-001", "start": 15, "end": 15},
+            ],
+        )
+
+        outputs = dqa_pipeline(
+            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [
+                    {"score": 0.9967, "answer": "1102/2019", "start": 22, "end": 22},
+                    {"score": 0.996, "answer": "us-001", "start": 15, "end": 15},
+                ]
+            ]
+            * 2,
+        )
+
     @slow
     @require_torch
     @require_pytesseract
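The new test pins `max_seq_len=50`, small enough that the OCR'd invoice overflows into several spans, so the assertions exercise the chunking path end to end. A hedged usage sketch mirroring the test (requires torch, detectron2, and pytesseract; `"invoice.png"` is a placeholder for `INVOICE_URL`):

```python
from transformers import pipeline

dqa = pipeline(
    "document-question-answering",
    model="tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa",
    max_seq_len=50,  # forces the document to split into multiple spans
)
print(dqa(image="invoice.png", question="What is the invoice number?", top_k=2))
```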
@@ -252,6 +298,59 @@ class DocumentQuestionAnsweringPipelineTests(unittest.TestCase, metaclass=Pipeli
             ],
         )

+    @slow
+    @require_torch
+    @require_pytesseract
+    @require_vision
+    def test_large_model_pt_layoutlm_chunk(self):
+        tokenizer = AutoTokenizer.from_pretrained(
+            "impira/layoutlm-document-qa", revision="3dc6de3", add_prefix_space=True
+        )
+        dqa_pipeline = pipeline(
+            "document-question-answering",
+            model="impira/layoutlm-document-qa",
+            tokenizer=tokenizer,
+            revision="3dc6de3",
+            max_seq_len=50,
+        )
+        image = INVOICE_URL
+        question = "What is the invoice number?"
+
+        outputs = dqa_pipeline(image=image, question=question, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9999, "answer": "us-001", "start": 15, "end": 15},
+                {"score": 0.9924, "answer": "us-001", "start": 15, "end": 15},
+            ],
+        )
+
+        outputs = dqa_pipeline(
+            [{"image": image, "question": question}, {"image": image, "question": question}], top_k=2
+        )
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                [
+                    {"score": 0.9999, "answer": "us-001", "start": 15, "end": 15},
+                    {"score": 0.9924, "answer": "us-001", "start": 15, "end": 15},
+                ]
+            ]
+            * 2,
+        )
+
+        word_boxes = list(zip(*apply_tesseract(load_image(image), None, "")))
+
+        # This model should also work if `image` is set to None
+        outputs = dqa_pipeline({"image": None, "word_boxes": word_boxes, "question": question}, top_k=2)
+        self.assertEqual(
+            nested_simplify(outputs, decimals=4),
+            [
+                {"score": 0.9999, "answer": "us-001", "start": 15, "end": 15},
+                {"score": 0.9924, "answer": "us-001", "start": 15, "end": 15},
+            ],
+        )
+
     @slow
     @require_torch
     def test_large_model_pt_donut(self):