Correct & Improve Doctests for LayoutLMv2 (#17168)
* add inference example to LayoutLMv2ForQuestionAnswering, passing doctest
* add loss example to LayoutLMv2ForQuestionAnswering, passing doctest
* Add correct doctest for LayoutLMv2ForTokenClassification, passing doctest
* add correct doctest for LayoutLMv2ForSequenceClassification, passing test
* add correct doctest for LayoutLMv2Model, passing test
* make fixup
* fix to address review comments
* make style
* fix doctest line break issue, add to documentation_tests.txt, address review comments
* move comment about layoutlmv2 dependencies to the doc page
* format doc page as suggested
* delete extraneous backtick

Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
commit 7b8cb26953 (parent b48ac1a094)
@@ -44,6 +44,14 @@ including FUNSD (0.7895 -> 0.8420), CORD (0.9493 -> 0.9601), SROIE (0.9524 -> 0.
 RVL-CDIP (0.9443 -> 0.9564), and DocVQA (0.7295 -> 0.8672). The pre-trained LayoutLMv2 model is publicly available at
 this https URL.*

+LayoutLMv2 depends on `detectron2`, `torchvision` and `tesseract`. Run the
+following to install them:
+```
+python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'
+python -m pip install torchvision tesseract
+```
+(If you are developing for LayoutLMv2, note that passing the doctests also requires the installation of these packages.)
+
 Tips:

 - The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during
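Not part of the commit, but useful before running these doctests: a minimal sanity check that the optional dependencies resolve. Note that the Tesseract OCR engine itself is a system binary; `pytesseract` is the Python binding the processor uses.

```python
# Editorial sketch: verify the optional LayoutLMv2 dependencies import cleanly
# before attempting the doctests below.
import importlib

for name in ("detectron2", "torchvision", "pytesseract"):
    try:
        importlib.import_module(name)
        print(f"{name}: ok")
    except ImportError as err:
        print(f"{name}: missing ({err})")
```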
src/transformers/models/layoutlmv2/modeling_layoutlmv2.py

@@ -14,7 +14,6 @@
 # limitations under the License.
 """ PyTorch LayoutLMv2 model."""

-
 import math
 from typing import Optional, Tuple, Union

@@ -821,24 +820,35 @@ class LayoutLMv2Model(LayoutLMv2PreTrainedModel):
         return_dict: Optional[bool] = None,
     ) -> Union[Tuple, BaseModelOutputWithPooling]:
         r"""
-        Returns:
+        Return:

         Examples:

         ```python
-        >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model
+        >>> from transformers import LayoutLMv2Processor, LayoutLMv2Model, set_seed
         >>> from PIL import Image
         >>> import torch
+        >>> from datasets import load_dataset
+
+        >>> set_seed(88)

         >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
         >>> model = LayoutLMv2Model.from_pretrained("microsoft/layoutlmv2-base-uncased")

-        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
+        >>> image_path = dataset["test"][0]["file"]
+        >>> image = Image.open(image_path).convert("RGB")

         >>> encoding = processor(image, return_tensors="pt")

         >>> outputs = model(**encoding)
         >>> last_hidden_states = outputs.last_hidden_state
-        ```"""
+
+        >>> last_hidden_states.shape
+        torch.Size([1, 342, 768])
+        ```
+        """
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
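The new expected shape is worth unpacking: LayoutLMv2 appends visual tokens from the backbone's pooled feature map to the text tokens. A sketch of the arithmetic, not part of the commit, reusing the variables from the doctest above and assuming the default `config.image_feature_pool_shape` of `[7, 7, 256]`:

```python
# Editorial sketch: the sequence length 342 is the text tokens plus the
# 7 * 7 = 49 visual tokens from the pooled backbone feature map.
pool_h, pool_w, _ = model.config.image_feature_pool_shape  # [7, 7, 256] by default
text_len = encoding["input_ids"].shape[1]  # 293 for this fixture document
assert last_hidden_states.shape[1] == text_len + pool_h * pool_w  # 293 + 49 == 342
```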
@@ -990,25 +1000,37 @@ class LayoutLMv2ForSequenceClassification(LayoutLMv2PreTrainedModel):

         Returns:

-        Examples:
+        Example:

         ```python
-        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification
+        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForSequenceClassification, set_seed
         >>> from PIL import Image
         >>> import torch
+        >>> from datasets import load_dataset
+
+        >>> set_seed(88)
+
+        >>> dataset = load_dataset("rvl_cdip", split="train", streaming=True)
+        >>> data = next(iter(dataset))
+        >>> image = data["image"].convert("RGB")

         >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
-        >>> model = LayoutLMv2ForSequenceClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
-
-        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+        >>> model = LayoutLMv2ForSequenceClassification.from_pretrained(
+        ...     "microsoft/layoutlmv2-base-uncased", num_labels=dataset.info.features["label"].num_classes
+        ... )

         >>> encoding = processor(image, return_tensors="pt")
-        >>> sequence_label = torch.tensor([1])
+        >>> sequence_label = torch.tensor([data["label"]])

         >>> outputs = model(**encoding, labels=sequence_label)
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
-        ```"""
+
+        >>> loss, logits = outputs.loss, outputs.logits
+        >>> predicted_idx = logits.argmax(dim=-1).item()
+        >>> predicted_answer = dataset.info.features["label"].names[4]
+        >>> predicted_idx, predicted_answer
+        (4, 'advertisement')
+        ```
+        """

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

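One detail a reader may stumble on: the doctest looks up `names[4]` with a literal index rather than with `predicted_idx`. The seeded prediction happens to be 4, so the expected output matches, but application code would index with the prediction itself. A short sketch, not part of the commit, continuing the example above:

```python
# Editorial sketch: look the label up from the model's own prediction instead
# of the literal index 4 used in the doctest.
label_names = dataset.info.features["label"].names
print(label_names[predicted_idx])  # 'advertisement' for this seeded example
```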
@@ -1157,26 +1179,48 @@ class LayoutLMv2ForTokenClassification(LayoutLMv2PreTrainedModel):

         Returns:

-        Examples:
+        Example:

         ```python
-        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
+        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification, set_seed
         >>> from PIL import Image
+        >>> from datasets import load_dataset
+
+        >>> set_seed(88)
+
+        >>> datasets = load_dataset("nielsr/funsd", split="test")
+        >>> labels = datasets.features["ner_tags"].feature.names
+        >>> id2label = {v: k for v, k in enumerate(labels)}

         >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
-        >>> model = LayoutLMv2ForTokenClassification.from_pretrained("microsoft/layoutlmv2-base-uncased")
+        >>> model = LayoutLMv2ForTokenClassification.from_pretrained(
+        ...     "microsoft/layoutlmv2-base-uncased", num_labels=len(labels)
+        ... )

-        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
-        >>> words = ["hello", "world"]
-        >>> boxes = [[1, 2, 3, 4], [5, 6, 7, 8]]  # make sure to normalize your bounding boxes
-        >>> word_labels = [0, 1]
-
-        >>> encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+        >>> data = datasets[0]
+        >>> image = Image.open(data["image_path"]).convert("RGB")
+        >>> words = data["words"]
+        >>> boxes = data["bboxes"]  # make sure to normalize your bounding boxes
+        >>> word_labels = data["ner_tags"]
+        >>> encoding = processor(
+        ...     image,
+        ...     words,
+        ...     boxes=boxes,
+        ...     word_labels=word_labels,
+        ...     padding="max_length",
+        ...     truncation=True,
+        ...     return_tensors="pt",
+        ... )

         >>> outputs = model(**encoding)
-        >>> loss = outputs.loss
-        >>> logits = outputs.logits
-        ```"""
+        >>> logits, loss = outputs.logits, outputs.loss
+
+        >>> predicted_token_class_ids = logits.argmax(-1)
+        >>> predicted_tokens_classes = [id2label[t.item()] for t in predicted_token_class_ids[0]]
+        >>> predicted_tokens_classes[:5]
+        ['B-ANSWER', 'B-HEADER', 'B-HEADER', 'B-HEADER', 'B-HEADER']
+        ```
+        """

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

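Since the processor emits sub-word tokens, per-token predictions like the ones above are often collapsed back to one label per word. A sketch, not part of the commit, assuming the fast tokenizer that LayoutLMv2Processor wraps by default (its `word_ids()` maps each token position to its source word, with `None` for special and padding positions):

```python
# Editorial sketch: keep the predicted class of each word's first sub-token,
# skipping special and padding positions (word_id is None there).
word_ids = encoding.word_ids(batch_index=0)
word_level = {}
for position, word_id in enumerate(word_ids):
    if word_id is not None and word_id not in word_level:
        word_level[word_id] = id2label[predicted_token_class_ids[0][position].item()]
print(list(word_level.items())[:5])
```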
@@ -1273,28 +1317,49 @@ class LayoutLMv2ForQuestionAnswering(LayoutLMv2PreTrainedModel):

         Returns:

-        Examples:
+        Example:
+
+        In this example below, we give the LayoutLMv2 model an image (of texts) and ask it a question. It will give us
+        a prediction of what it thinks the answer is (the span of the answer within the texts parsed from the image).

         ```python
-        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering
-        >>> from PIL import Image
+        >>> from transformers import LayoutLMv2Processor, LayoutLMv2ForQuestionAnswering, set_seed
         >>> import torch
+        >>> from PIL import Image
+        >>> from datasets import load_dataset

+        >>> set_seed(88)
         >>> processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
         >>> model = LayoutLMv2ForQuestionAnswering.from_pretrained("microsoft/layoutlmv2-base-uncased")

-        >>> image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
-        >>> question = "what's his name?"
-
+        >>> dataset = load_dataset("hf-internal-testing/fixtures_docvqa")
+        >>> image_path = dataset["test"][0]["file"]
+        >>> image = Image.open(image_path).convert("RGB")
+        >>> question = "When is coffee break?"
         >>> encoding = processor(image, question, return_tensors="pt")
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])

-        >>> outputs = model(**encoding, start_positions=start_positions, end_positions=end_positions)
-        >>> loss = outputs.loss
-        >>> start_scores = outputs.start_logits
-        >>> end_scores = outputs.end_logits
-        ```"""
+        >>> outputs = model(**encoding)
+        >>> predicted_start_idx = outputs.start_logits.argmax(-1).item()
+        >>> predicted_end_idx = outputs.end_logits.argmax(-1).item()
+        >>> predicted_start_idx, predicted_end_idx
+        (154, 287)
+
+        >>> predicted_answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + 1]
+        >>> predicted_answer = processor.tokenizer.decode(predicted_answer_tokens)
+        >>> predicted_answer  # results are not very good without further fine-tuning
+        'council mem - bers conducted by trrf treasurer philip g. kuehn to get answers which the public ...
+        ```
+
+        ```python
+        >>> target_start_index = torch.tensor([7])
+        >>> target_end_index = torch.tensor([14])
+        >>> outputs = model(**encoding, start_positions=target_start_index, end_positions=target_end_index)
+        >>> predicted_answer_span_start = outputs.start_logits.argmax(-1).item()
+        >>> predicted_answer_span_end = outputs.end_logits.argmax(-1).item()
+        >>> predicted_answer_span_start, predicted_answer_span_end
+        (154, 287)
+        ```
+        """

         return_dict = return_dict if return_dict is not None else self.config.use_return_dict

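The second block above supplies target positions but never reads the resulting loss. For completeness, a sketch of what that supervised call returns, not part of the commit and relying on how the `*ForQuestionAnswering` heads in transformers average the start and end cross-entropies:

```python
# Editorial sketch, continuing the second block above: with start/end positions
# supplied, outputs.loss is the mean of two cross-entropy terms.
import torch
import torch.nn.functional as F

manual_loss = (
    F.cross_entropy(outputs.start_logits, target_start_index)
    + F.cross_entropy(outputs.end_logits, target_end_index)
) / 2
print(torch.allclose(manual_loss, outputs.loss))  # True
```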
utils/documentation_tests.txt

@@ -32,6 +32,7 @@ src/transformers/models/glpn/modeling_glpn.py
 src/transformers/models/gpt2/modeling_gpt2.py
 src/transformers/models/gptj/modeling_gptj.py
 src/transformers/models/hubert/modeling_hubert.py
+src/transformers/models/layoutlmv2/modeling_layoutlmv2.py
 src/transformers/models/longformer/modeling_longformer.py
 src/transformers/models/longformer/modeling_tf_longformer.py
 src/transformers/models/marian/modeling_marian.py
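Registering the file here is what lets CI exercise the new examples. To try them locally, a minimal sketch, not the project's official doctest runner, and slow, since it downloads checkpoints and datasets:

```python
# Editorial sketch: run the docstring examples with the stdlib doctest module,
# assuming detectron2, torchvision and pytesseract are installed. ELLIPSIS is
# needed because some expected outputs are truncated with "...".
import doctest

from transformers.models.layoutlmv2 import modeling_layoutlmv2

results = doctest.testmod(modeling_layoutlmv2, optionflags=doctest.ELLIPSIS)
print(f"{results.attempted} attempted, {results.failed} failed")
```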