From 7b8cb2695348ef2f8fe16988639517b7fb85c2cb Mon Sep 17 00:00:00 2001
From: ghlai9665
Date: Mon, 23 May 2022 07:02:31 -0500
Subject: [PATCH] Correct & Improve Doctests for LayoutLMv2 (#17168)

* add inference example to LayoutLMv2ForQuestionAnswering, passing doctest

* add loss example to LayoutLMv2ForQuestionAnswering, passing doctest

* Add correct doctest for LayoutLMv2ForTokenClassification, passing doctest

* add correct doctest for LayoutLMv2ForSequenceClassification, passing test

* add correct doctest for LayoutLMv2Model, passing test

* make fixup

* fix to address review comments

* make style

* fix doctest line break issue, add to documentation_tests.txt, address review comments

* move comment about layoutlmv2 dependencies to the doc page

* format doc page as suggested

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* delete extraneous backtick

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
---
 docs/source/en/model_doc/layoutlmv2.mdx       |   8 +
 .../models/layoutlmv2/modeling_layoutlmv2.py  | 143 +++++++++++++-----
 utils/documentation_tests.txt                 |   1 +
 3 files changed, 113 insertions(+), 39 deletions(-)

diff --git a/docs/source/en/model_doc/layoutlmv2.mdx b/docs/source/en/model_doc/layoutlmv2.mdx
index 374cbcb775..e40a3cfc8d 100644
--- a/docs/source/en/model_doc/layoutlmv2.mdx
+++ b/docs/source/en/model_doc/layoutlmv2.mdx
@@ -44,6 +44,14 @@ including FUNSD (0.7895 -> 0.8420), CORD (0.9493 -> 0.9601), SROIE (0.9524 -> 0.
 RVL-CDIP (0.9443 -> 0.9564), and DocVQA (0.7295 -> 0.8672). The pre-trained LayoutLMv2 model is publicly available at
 this https URL.*
 
+LayoutLMv2 depends on `detectron2`, `torchvision` and `tesseract`. Run the
+following to install them:
+```
+python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+python -m pip install torchvision tesseract
+```
+(If you are developing for LayoutLMv2, note that passing the doctests also requires the installation of these packages.)
+
 Tips:
 
 - The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during
diff --git a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
index 269e951ea0..7faa34eec4 100755
--- a/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
+++ b/src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
@@ -14,7 +14,6 @@
 # limitations under the License.
""" PyTorch LayoutLMv2 model.""" - import math from typing import Optional, Tuple, Union @@ -821,24 +820,35 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel): return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" - Returns: + Return: Examples: ```python - >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model + >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model, set_seed >>> from PIL import Image + >>> import torch + >>> from datasets import load_dataset + + >>> set_seed(88) >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased") >>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased") - >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB") + + >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa") + >>> image_path = dataset["test"][0]["file"] + >>> image = Image.open(image_path).convert("RGB") >>> encoding = processor(image, return_tensors="pt") >>> outputs = model(**encoding) >>> last_hidden_states = outputs.last_hidden_state - ```""" + + >>> last_hidden_states.shape + torch.Size([1, 342, 768]) + ``` + """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states @@ -990,25 +1000,37 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel): Returns: - Examples: + Example: ```python - >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification + >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification, set_seed >>> from PIL import Image >>> import torch + >>> from datasets import load_dataset + + >>> set_seed(88) + + >>> dataset = load_dataset("rvl_cdip", split="train", streaming=True) + >>> data = next(iter(dataset)) + >>> image = data["image"].convert("RGB") >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased") - >>> model = LayoutLMv2ForSequenceClassification.from_pretrained("microsoft/layoutlmv2-base-uncased") - - >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB") + >>> model = LayoutLMv2ForSequenceClassification.from_pretrained( + ... "microsoft/layoutlmv2-base-uncased", num_labels=dataset.info.features["label"].num_classes + ... 
+        ... )
 
         >>> encoding = processor(image, return_tensors="pt")
-        >>> sequence_label = torch.tensor([1])
+        >>> sequence_label = torch.tensor([data["label"]])
 
         >>> outputs = model(**encoding, labels=sequence_label)
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
-        ```"""
+
+        >>> loss, logits = outputs.loss, outputs.logits
+        >>> predicted_idx = logits.argmax(dim=-1).item()
+        >>> predicted_answer = dataset.info.features["label"].names[predicted_idx]
+        >>> predicted_idx, predicted_answer
+        (4, 'advertisement')
+        ```
+        """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1157,26 +1179,48 @@
 
         Returns:
 
-        Examples:
+        Example:
 
         ```python
-        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
+        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification, set_seed
         >>> from PIL import Image
+        >>> from datasets import load_dataset
+
+        >>> set_seed(88)
+
+        >>> datasets = load_dataset("nielsr/funsd", split="test")
+        >>> labels = datasets.features["ner_tags"].feature.names
+        >>> id2label = {idx: label for idx, label in enumerate(labels)}
 
         >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
-        >>> model = LayoutLMv2ForTokenClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2ForTokenClassification.from_pretrained(
+        ...     "microsoft/layoutlmv2-base-uncased", num_labels=len(labels)
+        ... )
 
-        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
-        >>> words = ["hello", "world"]
-        >>> boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
-        >>> word_labels = [0, 1]
-
-        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+        >>> data = datasets[0]
+        >>> image = Image.open(data["image_path"]).convert("RGB")
+        >>> words = data["words"]
+        >>> boxes = data["bboxes"]  # make sure to normalize your bounding boxes
+        >>> word_labels = data["ner_tags"]
+        >>> encoding = processor(
+        ...     image,
+        ...     words,
+        ...     boxes=boxes,
+        ...     word_labels=word_labels,
+        ...     padding="max_length",
+        ...     truncation=True,
+        ...     return_tensors="pt",
+        ... )
 
         >>> outputs = model(**encoding)
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
-        ```"""
+        >>> logits, loss = outputs.logits, outputs.loss
+
+        >>> predicted_token_class_ids = logits.argmax(-1)
+        >>> predicted_tokens_classes = [id2label[t.item()] for t in predicted_token_class_ids[0]]
+        >>> predicted_tokens_classes[:5]
+        ['B-ANSWER', 'B-HEADER', 'B-HEADER', 'B-HEADER', 'B-HEADER']
+        ```
+        """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@@ -1273,28 +1317,49 @@
 
         Returns:
 
-        Examples:
+        Example:
+
+        In the example below, we give the LayoutLMv2 model an image (of texts) and ask it a question. It will give us
+        a prediction of what it thinks the answer is (the span of the answer within the texts parsed from the image).
 
         ```python
-        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering
-        >>> from PIL import Image
+        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering, set_seed
         >>> import torch
+        >>> from PIL import Image
+        >>> from datasets import load_dataset
 
+        >>> set_seed(88)
         >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
         >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
 
-        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
-        >>> question = "what's his name?"
-
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
+        >>> image_path = dataset["test"][0]["file"]
+        >>> image = Image.open(image_path).convert("RGB")
+        >>> question = "When is coffee break?"
         >>> encoding = processor(image, question, return_tensors="pt")
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-
-        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
-        >>> loss = outputs.loss
-        >>> start_scores = outputs.start_logits
-        >>> end_scores = outputs.end_logits
-        ```"""
+        >>> outputs = model(**encoding)
+        >>> predicted_start_idx = outputs.start_logits.argmax(-1).item()
+        >>> predicted_end_idx = outputs.end_logits.argmax(-1).item()
+        >>> predicted_start_idx, predicted_end_idx
+        (154, 287)
+
+        >>> predicted_answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
+        >>> predicted_answer = processor.tokenizer.decode(predicted_answer_tokens)
+        >>> predicted_answer  # results are not very good without further fine-tuning
+        'council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public ...
+        ```
+
+        ```python
+        >>> target_start_index = torch.tensor([7])
+        >>> target_end_index = torch.tensor([14])
+        >>> outputs = model(**encoding, start_positions=target_start_index, end_positions=target_end_index)
+        >>> predicted_answer_span_start = outputs.start_logits.argmax(-1).item()
+        >>> predicted_answer_span_end = outputs.end_logits.argmax(-1).item()
+        >>> predicted_answer_span_start, predicted_answer_span_end
+        (154, 287)
+        ```
+        """
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
diff --git a/utils/documentation_tests.txt b/utils/documentation_tests.txt
index 45a9eae973..ae0f39ac4f 100644
--- a/utils/documentation_tests.txt
+++ b/utils/documentation_tests.txt
@@ -32,6 +32,7 @@ src/transformers/models/glpn/modeling_glpn.py
 src/transformers/models/gpt2/modeling_gpt2.py
 src/transformers/models/gptj/modeling_gptj.py
 src/transformers/models/hubert/modeling_hubert.py
+src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
 src/transformers/models/longformer/modeling_longformer.py
 src/transformers/models/longformer/modeling_tf_longformer.py
 src/transformers/models/marian/modeling_marian.py
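Registering `src/transformers/models/layoutlmv2/modeling_layoutlmv2.py` in `utils/documentation_tests.txt`, as the last hunk does, is what opts its docstring examples into the doctest run. A rough sketch of how the patched file's doctests could be exercised locally, using only pytest's standard doctest options (the exact invocation used by the project's CI may differ, and the dependencies from the doc note above must be installed):

```python
import pytest

# Collect and run the doctests embedded in the patched module.
# -s disables output capture, -v is verbose, and
# --doctest-continue-on-failure keeps going past the first failing example.
exit_code = pytest.main(
    [
        "--doctest-modules",
        "src/transformers/models/layoutlmv2/modeling_layoutlmv2.py",
        "-sv",
        "--doctest-continue-on-failure",
    ]
)
```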
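On the `# make sure to normalize your bounding boxes` reminder kept in the token-classification example: LayoutLMv2 expects each word's box as `(x0, y0, x1, y1)` scaled to a 0-1000 range relative to the image size, which the FUNSD boxes used above already satisfy. A minimal sketch of such a helper; the `normalize_bbox` name, the input file, and the sample coordinates are illustrative, not part of this patch:

```python
from PIL import Image


def normalize_bbox(bbox, width, height):
    # Scale pixel coordinates (x0, y0, x1, y1) to the 0-1000 range LayoutLMv2 expects.
    x0, y0, x1, y1 = bbox
    return [
        int(1000 * x0 / width),
        int(1000 * y0 / height),
        int(1000 * x1 / width),
        int(1000 * y1 / height),
    ]


image = Image.open("document.png").convert("RGB")  # any document image
width, height = image.size
boxes = [normalize_bbox(box, width, height) for box in [(48, 84, 156, 108)]]
```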