Correct & Improve Doctests for LayoutLMv2 (#17168)

* add inference example to LayoutLMv2ForQuestionAnswering, passing doctest

* add loss example to LayoutLMv2ForQuestionAnswering, passing doctest

* Add correct doctest for LayoutLMv2ForTokenClassification, passing doctest

* add correct doctest for LayoutLMv2ForSequenceClassification, passing test

* add correct doctest for LayoutLMv2Model, passing test

* make fixup

* fix to address review comments

* make style

* fix doctest line break issue, add to documentation_tests.txt, address review comments

* move comment about layoutlmv2 dependencies to the doc page

* format doc page as suggested

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* delete extraneous backtick

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
ghlai9665 2022-05-23 07:02:31 -05:00 committed by GitHub
parent b48ac1a094
commit 7b8cb26953
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 113 additions and 39 deletions

View File

@ -44,6 +44,14 @@ including FUNSD (0.7895 -> 0.8420), CORD (0.9493 -> 0.9601), SROIE (0.9524 -> 0.
RVL-CDIP (0.9443 -> 0.9564), and DocVQA (0.7295 -> 0.8672). The pre-trained LayoutLMv2 model is publicly available at
this https URL.*
LayoutLMv2 depends on `detectron2`, `torchvision` and `tesseract`. Run the
following to install them:
```
python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
python -m pip install torchvision tesseract
```
(If you are developing for LayoutLMv2, note that passing the doctests also requires the installation of these packages.)
Tips:
- The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during

View File

@ -14,7 +14,6 @@
# limitations under the License.
""" PyTorch LayoutLMv2 model."""
import math
from typing import Optional, Tuple, Union
@ -821,24 +820,35 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPooling]:
r"""
Returns:
Return:
Examples:
```python
>>> from transformers import LayoutLMv2Processor, LayoutLMv2Model
>>> from transformers import LayoutLMv2Processor, LayoutLMv2Model, set_seed
>>> from PIL import Image
>>> import torch
>>> from datasets import load_dataset
>>> set_seed(88)
>>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")
>>> encoding = processor(image, return_tensors="pt")
>>> outputs = model(**encoding)
>>> last_hidden_states = outputs.last_hidden_state
```"""
>>> last_hidden_states.shape
torch.Size([1, 342, 768])
```
"""
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@ -990,25 +1000,37 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):
Returns:
Examples:
Example:
```python
>>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification
>>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification, set_seed
>>> from PIL import Image
>>> import torch
>>> from datasets import load_dataset
>>> set_seed(88)
>>> dataset = load_dataset("rvl_cdip", split="train", streaming=True)
>>> data = next(iter(dataset))
>>> image = data["image"].convert("RGB")
>>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2ForSequenceClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
>>> model = LayoutLMv2ForSequenceClassification.from_pretrained(
... "microsoft/layoutlmv2-base-uncased", num_labels=dataset.info.features["label"].num_classes
... )
>>> encoding = processor(image, return_tensors="pt")
>>> sequence_label = torch.tensor([1])
>>> sequence_label = torch.tensor([data["label"]])
>>> outputs = model(**encoding, labels=sequence_label)
>>> loss = outputs.loss
>>> logits = outputs.logits
```"""
>>> loss, logits = outputs.loss, outputs.logits
>>> predicted_idx = logits.argmax(dim=-1).item()
>>> predicted_answer = dataset.info.features["label"].names[predicted_idx]
>>> predicted_idx, predicted_answer
(4, 'advertisement')
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1157,26 +1179,48 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):
Returns:
Examples:
Example:
```python
>>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
>>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification, set_seed
>>> from PIL import Image
>>> from datasets import load_dataset
>>> set_seed(88)
>>> datasets = load_dataset("nielsr/funsd", split="test")
>>> labels = datasets.features["ner_tags"].feature.names
>>> id2label = {idx: label for idx, label in enumerate(labels)}
>>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
>>> model = LayoutLMv2ForTokenClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2ForTokenClassification.from_pretrained(
... "microsoft/layoutlmv2-base-uncased", num_labels=len(labels)
... )
>>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
>>> words = ["hello", "world"]
>>> boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
>>> word_labels = [0, 1]
>>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
>>> data = datasets[0]
>>> image = Image.open(data["image_path"]).convert("RGB")
>>> words = data["words"]
>>> boxes = data["bboxes"] # make sure to normalize your bounding boxes
>>> word_labels = data["ner_tags"]
>>> encoding = processor(
... image,
... words,
... boxes=boxes,
... word_labels=word_labels,
... padding="max_length",
... truncation=True,
... return_tensors="pt",
... )
>>> outputs = model(**encoding)
>>> loss = outputs.loss
>>> logits = outputs.logits
```"""
>>> logits, loss = outputs.logits, outputs.loss
>>> predicted_token_class_ids = logits.argmax(-1)
>>> predicted_tokens_classes = [id2label[t.item()] for t in predicted_token_class_ids[0]]
>>> predicted_tokens_classes[:5]
['B-ANSWER', 'B-HEADER', 'B-HEADER', 'B-HEADER', 'B-HEADER']
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
@ -1273,28 +1317,49 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):
Returns:
Examples:
Example:
In the example below, we give the LayoutLMv2 model an image (of text) and ask it a question. It will give us
a prediction of what it thinks the answer is (the span of the answer within the text parsed from the image).
```python
>>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering
>>> from PIL import Image
>>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering, set_seed
>>> import torch
>>> from PIL import Image
>>> from datasets import load_dataset
>>> set_seed(88)
>>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")
>>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
>>> question = "what's his name?"
>>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
>>> image_path = dataset["test"][0]["file"]
>>> image = Image.open(image_path).convert("RGB")
>>> question = "When is coffee break?"
>>> encoding = processor(image, question, return_tensors="pt")
>>> start_positions = torch.tensor([1])
>>> end_positions = torch.tensor([3])
>>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
>>> loss = outputs.loss
>>> start_scores = outputs.start_logits
>>> end_scores = outputs.end_logits
```"""
>>> outputs = model(**encoding)
>>> predicted_start_idx = outputs.start_logits.argmax(-1).item()
>>> predicted_end_idx = outputs.end_logits.argmax(-1).item()
>>> predicted_start_idx, predicted_end_idx
(154, 287)
>>> predicted_answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
>>> predicted_answer = processor.tokenizer.decode(predicted_answer_tokens)
>>> predicted_answer # results are not very good without further fine-tuning
'council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public ...
```
```python
>>> target_start_index = torch.tensor([7])
>>> target_end_index = torch.tensor([14])
>>> outputs = model(**encoding, start_positions=target_start_index, end_positions=target_end_index)
>>> predicted_answer_span_start = outputs.start_logits.argmax(-1).item()
>>> predicted_answer_span_end = outputs.end_logits.argmax(-1).item()
>>> predicted_answer_span_start, predicted_answer_span_end
(154, 287)
```
"""
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

View File

@ -32,6 +32,7 @@ src/transformers/models/glpn/modeling_glpn.py
src/transformers/models/gpt2/modeling_gpt2.py
src/transformers/models/gptj/modeling_gptj.py
src/transformers/models/hubert/modeling_hubert.py
src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
src/transformers/models/longformer/modeling_longformer.py
src/transformers/models/longformer/modeling_tf_longformer.py
src/transformers/models/marian/modeling_marian.py