[Data2Vec] Add data2vec vision (#16760)
* save intermediate * add vision * add vision * save * finish models * finish models * continue * finish * up * up * up * tests all pass * clean up * up * up * fix bugs in beit * correct docs * finish * finish docs * make style * up * more fixes * fix type hint * make style * Apply suggestions from code review Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update tests/data2vec/test_modeling_data2vec_vision.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * fix test Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
This commit is contained in:
parent
33cd4be576
commit
8d3f952adb
|
@ -190,6 +190,7 @@ Flax), PyTorch, and/or TensorFlow.
|
|||
| CTRL | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Data2VecAudio | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecText | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| Data2VecVision | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
| DeBERTa | ✅ | ✅ | ✅ | ✅ | ❌ |
|
||||
| DeBERTa-v2 | ✅ | ❌ | ✅ | ✅ | ❌ |
|
||||
| Decision Transformer | ❌ | ❌ | ✅ | ❌ | ❌ |
|
||||
|
|
|
@ -33,10 +33,13 @@ Models and code are available at www.github.com/pytorch/fairseq/tree/master/exam
|
|||
|
||||
Tips:
|
||||
|
||||
- Both Data2VecAudio and Data2VecText have been trained using the same self-supervised learning method.
|
||||
In the case of Data2VecAudio, preprocessing is identical to [`RobertaModel`], including tokenization.
|
||||
- Data2VecAudio, Data2VecText, and Data2VecVision have all been trained using the same self-supervised learning method.
|
||||
- For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction
|
||||
- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
|
||||
- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
|
||||
|
||||
This model was contributed by [edugp](https://huggingface.co/edugp) and [patrickvonplaten](https://huggingface.co/patrickvonplaten)
|
||||
|
||||
This model was contributed by [edugp](https://huggingface.co/edugp).
|
||||
The original code can be found [here](https://github.com/pytorch/fairseq/tree/main/examples/data2vec).
|
||||
|
||||
|
||||
|
@ -48,12 +51,16 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma
|
|||
|
||||
[[autodoc]] Data2VecAudioConfig
|
||||
|
||||
## Data2VecVisionConfig
|
||||
|
||||
[[autodoc]] Data2VecVisionConfig
|
||||
|
||||
|
||||
## Data2VecAudioModel
|
||||
|
||||
[[autodoc]] Data2VecAudioModel
|
||||
- forward
|
||||
|
||||
|
||||
## Data2VecAudioForAudioFrameClassification
|
||||
|
||||
[[autodoc]] Data2VecAudioForAudioFrameClassification
|
||||
|
@ -108,3 +115,18 @@ The original code can be found [here](https://github.com/pytorch/fairseq/tree/ma
|
|||
|
||||
[[autodoc]] Data2VecTextForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## Data2VecVisionModel
|
||||
|
||||
[[autodoc]] Data2VecVisionModel
|
||||
- forward
|
||||
|
||||
## Data2VecVisionForImageClassification
|
||||
|
||||
[[autodoc]] Data2VecVisionForImageClassification
|
||||
- forward
|
||||
|
||||
## Data2VecVisionForSemanticSegmentation
|
||||
|
||||
[[autodoc]] Data2VecVisionForSemanticSegmentation
|
||||
- forward
|
||||
|
|
|
@ -54,6 +54,7 @@ Ready-made configurations include the following architectures:
|
|||
- BlenderbotSmall
|
||||
- CamemBERT
|
||||
- Data2VecText
|
||||
- Data2VecVision
|
||||
- DistilBERT
|
||||
- ELECTRA
|
||||
- FlauBERT
|
||||
|
|
|
@ -170,7 +170,13 @@ _import_structure = {
|
|||
"models.convnext": ["CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ConvNextConfig"],
|
||||
"models.cpm": ["CpmTokenizer"],
|
||||
"models.ctrl": ["CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP", "CTRLConfig", "CTRLTokenizer"],
|
||||
"models.data2vec": ["DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP", "Data2VecAudioConfig", "Data2VecTextConfig"],
|
||||
"models.data2vec": [
|
||||
"DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||
"DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||
"Data2VecAudioConfig",
|
||||
"Data2VecTextConfig",
|
||||
"Data2VecVisionConfig",
|
||||
],
|
||||
"models.deberta": ["DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaConfig", "DebertaTokenizer"],
|
||||
"models.deberta_v2": ["DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP", "DebertaV2Config"],
|
||||
"models.decision_transformer": ["DECISION_TRANSFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "DecisionTransformerConfig"],
|
||||
|
@ -868,6 +874,7 @@ if is_torch_available():
|
|||
[
|
||||
"DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"Data2VecAudioForAudioFrameClassification",
|
||||
"Data2VecAudioForCTC",
|
||||
"Data2VecAudioForSequenceClassification",
|
||||
|
@ -882,6 +889,10 @@ if is_torch_available():
|
|||
"Data2VecTextForTokenClassification",
|
||||
"Data2VecTextModel",
|
||||
"Data2VecTextPreTrainedModel",
|
||||
"Data2VecVisionForImageClassification",
|
||||
"Data2VecVisionForSemanticSegmentation",
|
||||
"Data2VecVisionModel",
|
||||
"Data2VecVisionPreTrainedModel",
|
||||
]
|
||||
)
|
||||
_import_structure["models.deberta"].extend(
|
||||
|
@ -2555,7 +2566,13 @@ if TYPE_CHECKING:
|
|||
from .models.convnext import CONVNEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, ConvNextConfig
|
||||
from .models.cpm import CpmTokenizer
|
||||
from .models.ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP, CTRLConfig, CTRLTokenizer
|
||||
from .models.data2vec import DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig, Data2VecTextConfig
|
||||
from .models.data2vec import (
|
||||
DATA2VEC_TEXT_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
Data2VecAudioConfig,
|
||||
Data2VecTextConfig,
|
||||
Data2VecVisionConfig,
|
||||
)
|
||||
from .models.deberta import DEBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaConfig, DebertaTokenizer
|
||||
from .models.deberta_v2 import DEBERTA_V2_PRETRAINED_CONFIG_ARCHIVE_MAP, DebertaV2Config
|
||||
from .models.decision_transformer import (
|
||||
|
@ -3151,6 +3168,7 @@ if TYPE_CHECKING:
|
|||
from .models.data2vec import (
|
||||
DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
Data2VecAudioForAudioFrameClassification,
|
||||
Data2VecAudioForCTC,
|
||||
Data2VecAudioForSequenceClassification,
|
||||
|
@ -3165,6 +3183,10 @@ if TYPE_CHECKING:
|
|||
Data2VecTextForTokenClassification,
|
||||
Data2VecTextModel,
|
||||
Data2VecTextPreTrainedModel,
|
||||
Data2VecVisionForImageClassification,
|
||||
Data2VecVisionForSemanticSegmentation,
|
||||
Data2VecVisionModel,
|
||||
Data2VecVisionPreTrainedModel,
|
||||
)
|
||||
from .models.deberta import (
|
||||
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
|
|
|
@ -59,6 +59,7 @@ CONFIG_MAPPING_NAMES = OrderedDict(
|
|||
("layoutlmv2", "LayoutLMv2Config"),
|
||||
("plbart", "PLBartConfig"),
|
||||
("beit", "BeitConfig"),
|
||||
("data2vec-vision", "Data2VecVisionConfig"),
|
||||
("rembert", "RemBertConfig"),
|
||||
("visual_bert", "VisualBertConfig"),
|
||||
("canine", "CanineConfig"),
|
||||
|
@ -162,6 +163,7 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
|
|||
("layoutlmv2", "LAYOUTLMV2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("plbart", "PLBART_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("beit", "BEIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("data2vec-vision", "DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("rembert", "REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("visual_bert", "VISUAL_BERT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
("canine", "CANINE_PRETRAINED_CONFIG_ARCHIVE_MAP"),
|
||||
|
@ -349,12 +351,18 @@ MODEL_NAMES_MAPPING = OrderedDict(
|
|||
("layoutxlm", "LayoutXLM"),
|
||||
("data2vec-audio", "Data2VecAudio"),
|
||||
("data2vec-text", "Data2VecText"),
|
||||
("data2vec-vision", "Data2VecVision"),
|
||||
("dit", "DiT"),
|
||||
]
|
||||
)
|
||||
|
||||
SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict(
|
||||
[("openai-gpt", "openai"), ("data2vec-audio", "data2vec"), ("data2vec-text", "data2vec")]
|
||||
[
|
||||
("openai-gpt", "openai"),
|
||||
("data2vec-audio", "data2vec"),
|
||||
("data2vec-text", "data2vec"),
|
||||
("data2vec-vision", "data2vec"),
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
|
|
|
@ -55,6 +55,7 @@ MODEL_MAPPING_NAMES = OrderedDict(
|
|||
("layoutlmv2", "LayoutLMv2Model"),
|
||||
("plbart", "PLBartModel"),
|
||||
("beit", "BeitModel"),
|
||||
("data2vec-vision", "Data2VecVisionModel"),
|
||||
("rembert", "RemBertModel"),
|
||||
("visual_bert", "VisualBertModel"),
|
||||
("canine", "CanineModel"),
|
||||
|
@ -290,6 +291,7 @@ MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
|
|||
("vit", "ViTForImageClassification"),
|
||||
("deit", ("DeiTForImageClassification", "DeiTForImageClassificationWithTeacher")),
|
||||
("beit", "BeitForImageClassification"),
|
||||
("data2vec-vision", "Data2VecVisionForImageClassification"),
|
||||
("segformer", "SegformerForImageClassification"),
|
||||
("imagegpt", "ImageGPTForImageClassification"),
|
||||
(
|
||||
|
@ -321,6 +323,7 @@ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict(
|
|||
[
|
||||
# Model for Semantic Segmentation mapping
|
||||
("beit", "BeitForSemanticSegmentation"),
|
||||
("data2vec-vision", "Data2VecVisionForSemanticSegmentation"),
|
||||
("segformer", "SegformerForSemanticSegmentation"),
|
||||
("dpt", "DPTForSemanticSegmentation"),
|
||||
]
|
||||
|
|
|
@ -702,7 +702,8 @@ class BeitModel(BeitPreTrainedModel):
|
|||
pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
|
||||
|
||||
if not return_dict:
|
||||
return (sequence_output, pooled_output) + encoder_outputs[1:]
|
||||
head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
|
||||
return head_outputs + encoder_outputs[1:]
|
||||
|
||||
return BeitModelOutputWithPooling(
|
||||
last_hidden_state=sequence_output,
|
||||
|
@ -713,7 +714,7 @@ class BeitModel(BeitPreTrainedModel):
|
|||
|
||||
|
||||
class BeitPooler(nn.Module):
|
||||
def __init__(self, config: BeitModel) -> None:
|
||||
def __init__(self, config: BeitConfig) -> None:
|
||||
super().__init__()
|
||||
self.layernorm = (
|
||||
nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None
|
||||
|
@ -736,7 +737,7 @@ class BeitPooler(nn.Module):
|
|||
BEIT_START_DOCSTRING,
|
||||
)
|
||||
class BeitForMaskedImageModeling(BeitPreTrainedModel):
|
||||
def __init__(self, config: BeitModel) -> None:
|
||||
def __init__(self, config: BeitConfig) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -817,7 +818,7 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
|
|||
masked_lm_loss = loss_fct(prediction_scores[bool_masked_pos], labels)
|
||||
|
||||
if not return_dict:
|
||||
output = (prediction_scores,) + outputs[2:]
|
||||
output = (prediction_scores,) + outputs[1:]
|
||||
return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
|
||||
|
||||
return MaskedLMOutput(
|
||||
|
@ -836,7 +837,7 @@ class BeitForMaskedImageModeling(BeitPreTrainedModel):
|
|||
BEIT_START_DOCSTRING,
|
||||
)
|
||||
class BeitForImageClassification(BeitPreTrainedModel):
|
||||
def __init__(self, config: BeitModel) -> None:
|
||||
def __init__(self, config: BeitConfig) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
self.num_labels = config.num_labels
|
||||
|
@ -1237,7 +1238,7 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
|
|||
return_dict=return_dict,
|
||||
)
|
||||
|
||||
encoder_hidden_states = outputs.hidden_states if return_dict else outputs[2]
|
||||
encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
|
||||
|
||||
# only keep certain features, and reshape
|
||||
# note that we do +1 as the encoder_hidden_states also includes the initial embeddings
|
||||
|
@ -1268,9 +1269,9 @@ class BeitForSemanticSegmentation(BeitPreTrainedModel):
|
|||
|
||||
if not return_dict:
|
||||
if output_hidden_states:
|
||||
output = (logits,) + outputs[2:]
|
||||
output = (logits,) + outputs[1:]
|
||||
else:
|
||||
output = (logits,) + outputs[3:]
|
||||
output = (logits,) + outputs[2:]
|
||||
return ((loss,) + output) if loss is not None else output
|
||||
|
||||
return SemanticSegmenterOutput(
|
||||
|
|
|
@ -31,6 +31,11 @@ _import_structure = {
|
|||
"Data2VecTextConfig",
|
||||
"Data2VecTextOnnxConfig",
|
||||
],
|
||||
"configuration_data2vec_vision": [
|
||||
"DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP",
|
||||
"Data2VecVisionConfig",
|
||||
"Data2VecVisionOnnxConfig",
|
||||
],
|
||||
}
|
||||
|
||||
if is_torch_available():
|
||||
|
@ -54,6 +59,14 @@ if is_torch_available():
|
|||
"Data2VecTextModel",
|
||||
"Data2VecTextPreTrainedModel",
|
||||
]
|
||||
_import_structure["modeling_data2vec_vision"] = [
|
||||
"DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST",
|
||||
"Data2VecVisionForImageClassification",
|
||||
"Data2VecVisionForMaskedImageModeling",
|
||||
"Data2VecVisionForSemanticSegmentation",
|
||||
"Data2VecVisionModel",
|
||||
"Data2VecVisionPreTrainedModel",
|
||||
]
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .configuration_data2vec_audio import DATA2VEC_AUDIO_PRETRAINED_CONFIG_ARCHIVE_MAP, Data2VecAudioConfig
|
||||
|
@ -62,6 +75,11 @@ if TYPE_CHECKING:
|
|||
Data2VecTextConfig,
|
||||
Data2VecTextOnnxConfig,
|
||||
)
|
||||
from .configuration_data2vec_vision import (
|
||||
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP,
|
||||
Data2VecVisionConfig,
|
||||
Data2VecVisionOnnxConfig,
|
||||
)
|
||||
|
||||
if is_torch_available():
|
||||
from .modeling_data2vec_audio import (
|
||||
|
@ -84,6 +102,14 @@ if TYPE_CHECKING:
|
|||
Data2VecTextModel,
|
||||
Data2VecTextPreTrainedModel,
|
||||
)
|
||||
from .modeling_data2vec_vision import (
|
||||
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
Data2VecVisionForImageClassification,
|
||||
Data2VecVisionForMaskedImageModeling,
|
||||
Data2VecVisionForSemanticSegmentation,
|
||||
Data2VecVisionModel,
|
||||
Data2VecVisionPreTrainedModel,
|
||||
)
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
|
|
@ -0,0 +1,200 @@
|
|||
# coding=utf-8
|
||||
# Copyright Meta Platforms and The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Data2VecVision model configuration"""
|
||||
from collections import OrderedDict
|
||||
from typing import Mapping
|
||||
|
||||
from packaging import version
|
||||
|
||||
from ...configuration_utils import PretrainedConfig
|
||||
from ...onnx import OnnxConfig
|
||||
from ...utils import logging
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
DATA2VEC_VISION_PRETRAINED_CONFIG_ARCHIVE_MAP = {
|
||||
"facebook/data2vec-vision-base-ft": "https://huggingface.co/facebook/data2vec-vision-base-ft/resolve/main/config.json",
|
||||
}
|
||||
|
||||
|
||||
class Data2VecVisionConfig(PretrainedConfig):
|
||||
r"""
|
||||
This is the configuration class to store the configuration of a [`Data2VecVisionModel`]. It is used to instantiate
|
||||
an Data2VecVision model according to the specified arguments, defining the model architecture. Instantiating a
|
||||
configuration with the defaults will yield a similar configuration to that of the Data2VecVision
|
||||
[facebook/data2vec-vision-base](https://huggingface.co/facebook/data2vec-vision-base) architecture.
|
||||
|
||||
Args:
|
||||
vocab_size (`int`, *optional*, defaults to 8092):
|
||||
Vocabulary size of the Data2VecVision model. Defines the number of different image tokens that can be used
|
||||
during pre-training.
|
||||
hidden_size (`int`, *optional*, defaults to 768):
|
||||
Dimensionality of the encoder layers and the pooler layer.
|
||||
num_hidden_layers (`int`, *optional*, defaults to 12):
|
||||
Number of hidden layers in the Transformer encoder.
|
||||
num_attention_heads (`int`, *optional*, defaults to 12):
|
||||
Number of attention heads for each attention layer in the Transformer encoder.
|
||||
intermediate_size (`int`, *optional*, defaults to 3072):
|
||||
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
|
||||
hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
|
||||
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
|
||||
`"relu"`, `"selu"` and `"gelu_new"` are supported.
|
||||
hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
|
||||
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
|
||||
attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
|
||||
The dropout ratio for the attention probabilities.
|
||||
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
|
||||
The epsilon used by the layer normalization layers.
|
||||
image_size (`int`, *optional*, defaults to 224):
|
||||
The size (resolution) of each image.
|
||||
patch_size (`int`, *optional*, defaults to 16):
|
||||
The size (resolution) of each patch.
|
||||
num_channels (`int`, *optional*, defaults to 3):
|
||||
The number of input channels.
|
||||
use_mask_token (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use a mask token for masked image modeling.
|
||||
use_absolute_position_embeddings (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use BERT-style absolute position embeddings.
|
||||
use_relative_position_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use T5-style relative position embeddings in the self-attention layers.
|
||||
use_shared_relative_position_bias (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use the same relative position embeddings across all self-attention layers of the Transformer.
|
||||
layer_scale_init_value (`float`, *optional*, defaults to 0.1):
|
||||
Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale.
|
||||
drop_path_rate (`float`, *optional*, defaults to 0.1):
|
||||
Stochastic depth rate per sample (when applied in the main path of residual layers).
|
||||
use_mean_pooling (`bool`, *optional*, defaults to `True`):
|
||||
Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the
|
||||
CLS token, before applying the classification head.
|
||||
out_indices (`List[int]`, *optional*, defaults to `[3, 5, 7, 11]`):
|
||||
Indices of the feature maps to use for semantic segmentation.
|
||||
pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`):
|
||||
Pooling scales used in Pooling Pyramid Module applied on the last feature map.
|
||||
use_auxiliary_head (`bool`, *optional*, defaults to `True`):
|
||||
Whether to use an auxiliary head during training.
|
||||
auxiliary_loss_weight (`float`, *optional*, defaults to 0.4):
|
||||
Weight of the cross-entropy loss of the auxiliary head.
|
||||
auxiliary_channels (`int`, *optional*, defaults to 256):
|
||||
Number of channels to use in the auxiliary head.
|
||||
auxiliary_num_convs (`int`, *optional*, defaults to 1):
|
||||
Number of convolutional layers to use in the auxiliary head.
|
||||
auxiliary_concat_input (`bool`, *optional*, defaults to `False`):
|
||||
Whether to concatenate the output of the auxiliary head with the input before the classification layer.
|
||||
semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
|
||||
The index that is ignored by the loss function of the semantic segmentation model.
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from transformers import Data2VecVisionModel, Data2VecVisionConfig
|
||||
|
||||
>>> # Initializing a Data2VecVision data2vec_vision-base-patch16-224-in22k style configuration
|
||||
>>> configuration = Data2VecVisionConfig()
|
||||
|
||||
>>> # Initializing a model from the data2vec_vision-base-patch16-224-in22k style configuration
|
||||
>>> model = Data2VecVisionModel(configuration)
|
||||
|
||||
>>> # Accessing the model configuration
|
||||
>>> configuration = model.config
|
||||
```"""
|
||||
model_type = "data2vec-vision"
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size=8192,
|
||||
hidden_size=768,
|
||||
num_hidden_layers=12,
|
||||
num_attention_heads=12,
|
||||
intermediate_size=3072,
|
||||
hidden_act="gelu",
|
||||
hidden_dropout_prob=0.0,
|
||||
attention_probs_dropout_prob=0.0,
|
||||
initializer_range=0.02,
|
||||
layer_norm_eps=1e-12,
|
||||
is_encoder_decoder=False,
|
||||
image_size=224,
|
||||
patch_size=16,
|
||||
num_channels=3,
|
||||
use_mask_token=False,
|
||||
use_absolute_position_embeddings=False,
|
||||
use_relative_position_bias=False,
|
||||
use_shared_relative_position_bias=False,
|
||||
layer_scale_init_value=0.1,
|
||||
drop_path_rate=0.1,
|
||||
use_mean_pooling=True,
|
||||
out_indices=[3, 5, 7, 11],
|
||||
pool_scales=[1, 2, 3, 6],
|
||||
use_auxiliary_head=True,
|
||||
auxiliary_loss_weight=0.4,
|
||||
auxiliary_channels=256,
|
||||
auxiliary_num_convs=1,
|
||||
auxiliary_concat_input=False,
|
||||
semantic_loss_ignore_index=255,
|
||||
**kwargs
|
||||
):
|
||||
super().__init__(**kwargs)
|
||||
|
||||
self.vocab_size = vocab_size
|
||||
self.hidden_size = hidden_size
|
||||
self.num_hidden_layers = num_hidden_layers
|
||||
self.num_attention_heads = num_attention_heads
|
||||
self.intermediate_size = intermediate_size
|
||||
self.hidden_act = hidden_act
|
||||
self.hidden_dropout_prob = hidden_dropout_prob
|
||||
self.attention_probs_dropout_prob = attention_probs_dropout_prob
|
||||
self.initializer_range = initializer_range
|
||||
self.layer_norm_eps = layer_norm_eps
|
||||
|
||||
self.image_size = image_size
|
||||
self.patch_size = patch_size
|
||||
self.num_channels = num_channels
|
||||
self.use_mask_token = use_mask_token
|
||||
self.use_absolute_position_embeddings = use_absolute_position_embeddings
|
||||
self.use_relative_position_bias = use_relative_position_bias
|
||||
self.use_shared_relative_position_bias = use_shared_relative_position_bias
|
||||
self.layer_scale_init_value = layer_scale_init_value
|
||||
self.drop_path_rate = drop_path_rate
|
||||
self.use_mean_pooling = use_mean_pooling
|
||||
# decode head attributes (semantic segmentation)
|
||||
self.out_indices = out_indices
|
||||
self.pool_scales = pool_scales
|
||||
# auxiliary head attributes (semantic segmentation)
|
||||
self.use_auxiliary_head = use_auxiliary_head
|
||||
self.auxiliary_loss_weight = auxiliary_loss_weight
|
||||
self.auxiliary_channels = auxiliary_channels
|
||||
self.auxiliary_num_convs = auxiliary_num_convs
|
||||
self.auxiliary_concat_input = auxiliary_concat_input
|
||||
self.semantic_loss_ignore_index = semantic_loss_ignore_index
|
||||
|
||||
|
||||
# Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig
|
||||
class Data2VecVisionOnnxConfig(OnnxConfig):
|
||||
|
||||
torch_onnx_minimum_version = version.parse("1.11")
|
||||
|
||||
@property
|
||||
def inputs(self) -> Mapping[str, Mapping[int, str]]:
|
||||
return OrderedDict(
|
||||
[
|
||||
("pixel_values", {0: "batch", 1: "sequence"}),
|
||||
]
|
||||
)
|
||||
|
||||
@property
|
||||
def atol_for_validation(self) -> float:
|
||||
return 1e-4
|
|
@ -0,0 +1,374 @@
|
|||
#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from huggingface_hub import hf_hub_download
|
||||
from timm.models import create_model
|
||||
from transformers import (
|
||||
BeitFeatureExtractor,
|
||||
Data2VecVisionConfig,
|
||||
Data2VecVisionForImageClassification,
|
||||
Data2VecVisionModel,
|
||||
)
|
||||
|
||||
|
||||
def create_rename_keys(config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec."):
|
||||
prefix = "backbone." if is_semantic else ""
|
||||
|
||||
rename_keys = []
|
||||
for i in range(config.num_hidden_layers):
|
||||
# encoder layers: output projection, 2 feedforward neural networks and 2 layernorms
|
||||
rename_keys.append(
|
||||
(f"{prefix}blocks.{i}.norm1.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_before.weight")
|
||||
)
|
||||
rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_before.bias"))
|
||||
rename_keys.append(
|
||||
(f"{prefix}blocks.{i}.attn.proj.weight", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.weight")
|
||||
)
|
||||
rename_keys.append(
|
||||
(f"{prefix}blocks.{i}.attn.proj.bias", f"{hf_prefix}encoder.layer.{i}.attention.output.dense.bias")
|
||||
)
|
||||
rename_keys.append(
|
||||
(f"{prefix}blocks.{i}.norm2.weight", f"{hf_prefix}encoder.layer.{i}.layernorm_after.weight")
|
||||
)
|
||||
rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"{hf_prefix}encoder.layer.{i}.layernorm_after.bias"))
|
||||
rename_keys.append(
|
||||
(f"{prefix}blocks.{i}.mlp.fc1.weight", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.weight")
|
||||
)
|
||||
rename_keys.append(
|
||||
(f"{prefix}blocks.{i}.mlp.fc1.bias", f"{hf_prefix}encoder.layer.{i}.intermediate.dense.bias")
|
||||
)
|
||||
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"{hf_prefix}encoder.layer.{i}.output.dense.weight"))
|
||||
rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"{hf_prefix}encoder.layer.{i}.output.dense.bias"))
|
||||
|
||||
# projection layer + position embeddings
|
||||
rename_keys.extend(
|
||||
[
|
||||
(f"{prefix}cls_token", f"{hf_prefix}embeddings.cls_token"),
|
||||
(f"{prefix}patch_embed.proj.weight", f"{hf_prefix}embeddings.patch_embeddings.projection.weight"),
|
||||
(f"{prefix}patch_embed.proj.bias", f"{hf_prefix}embeddings.patch_embeddings.projection.bias"),
|
||||
]
|
||||
)
|
||||
|
||||
if has_lm_head:
|
||||
# mask token + shared relative position bias + layernorm
|
||||
rename_keys.extend(
|
||||
[
|
||||
("mask_token", f"{hf_prefix}embeddings.mask_token"),
|
||||
(
|
||||
"rel_pos_bias.relative_position_bias_table",
|
||||
f"{hf_prefix}encoder.relative_position_bias.relative_position_bias_table",
|
||||
),
|
||||
(
|
||||
"rel_pos_bias.relative_position_index",
|
||||
f"{hf_prefix}encoder.relative_position_bias.relative_position_index",
|
||||
),
|
||||
("norm.weight", "layernorm.weight"),
|
||||
("norm.bias", "layernorm.bias"),
|
||||
]
|
||||
)
|
||||
elif is_semantic:
|
||||
# semantic segmentation classification heads
|
||||
rename_keys.extend(
|
||||
[
|
||||
("decode_head.conv_seg.weight", "decode_head.classifier.weight"),
|
||||
("decode_head.conv_seg.bias", "decode_head.classifier.bias"),
|
||||
("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"),
|
||||
("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"),
|
||||
]
|
||||
)
|
||||
else:
|
||||
# layernorm + classification head
|
||||
rename_keys.extend(
|
||||
[
|
||||
("fc_norm.weight", f"{hf_prefix}pooler.layernorm.weight"),
|
||||
("fc_norm.bias", f"{hf_prefix}pooler.layernorm.bias"),
|
||||
("head.weight", "classifier.weight"),
|
||||
("head.bias", "classifier.bias"),
|
||||
]
|
||||
)
|
||||
|
||||
return rename_keys
|
||||
|
||||
|
||||
def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False, hf_prefix="data2vec_vision."):
|
||||
for i in range(config.num_hidden_layers):
|
||||
prefix = "backbone." if is_semantic else ""
|
||||
# queries, keys and values
|
||||
in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight")
|
||||
q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias")
|
||||
v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias")
|
||||
|
||||
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[
|
||||
: config.hidden_size, :
|
||||
]
|
||||
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.query.bias"] = q_bias
|
||||
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[
|
||||
config.hidden_size : config.hidden_size * 2, :
|
||||
]
|
||||
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[
|
||||
-config.hidden_size :, :
|
||||
]
|
||||
state_dict[f"{hf_prefix}encoder.layer.{i}.attention.attention.value.bias"] = v_bias
|
||||
|
||||
# gamma_1 and gamma_2
|
||||
# we call them lambda because otherwise they are renamed when using .from_pretrained
|
||||
gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1")
|
||||
gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2")
|
||||
|
||||
state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_1"] = gamma_1
|
||||
state_dict[f"{hf_prefix}encoder.layer.{i}.lambda_2"] = gamma_2
|
||||
|
||||
# relative_position bias table + index
|
||||
if not has_lm_head:
|
||||
# each layer has its own relative position bias
|
||||
table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table")
|
||||
index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index")
|
||||
|
||||
state_dict[
|
||||
f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table"
|
||||
] = table
|
||||
state_dict[
|
||||
f"{hf_prefix}encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index"
|
||||
] = index
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(
|
||||
"Convert Data2VecVision to HF for image classification and pretraining", add_help=False
|
||||
)
|
||||
parser.add_argument("--hf_checkpoint_name", type=str)
|
||||
parser.add_argument("--input_size", default=224, type=int, help="images input size")
|
||||
parser.add_argument("--beit_checkpoint", default="", help="beit checkpoint")
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def load_beit_model(args, is_finetuned, is_large):
    """Instantiate the original (timm-style) Beit model and load the data2vec checkpoint into it.

    Args:
        args: parsed CLI namespace; reads `input_size` / `beit_checkpoint` and
            writes `window_size` as a side effect.
        is_finetuned: whether the checkpoint is an ImageNet-1k fine-tuned model
            (adds a classification head and per-layer relative position bias).
        is_large: whether to build the large (24-layer) variant instead of base.

    Returns:
        The original Beit model with the checkpoint weights loaded.
    """

    def load_state_dict(model, state_dict, prefix="", ignore_missing="relative_position_index"):
        # Lenient state-dict loader: collects missing/unexpected keys instead of
        # raising, and suppresses warnings for keys matching `ignore_missing`
        # ("|"-separated substrings).
        missing_keys = []
        unexpected_keys = []
        error_msgs = []
        # copy state_dict so _load_from_state_dict can modify it
        metadata = getattr(state_dict, "_metadata", None)
        state_dict = state_dict.copy()
        if metadata is not None:
            state_dict._metadata = metadata

        def load(module, prefix=""):
            # Recursively load each submodule, mirroring nn.Module.load_state_dict
            # but with strict=False-style bookkeeping.
            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
            module._load_from_state_dict(
                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
            )
            for name, child in module._modules.items():
                if child is not None:
                    load(child, prefix + name + ".")

        load(model, prefix=prefix)

        # Split missing keys into ones worth warning about vs. deliberately ignored.
        warn_missing_keys = []
        ignore_missing_keys = []
        for key in missing_keys:
            keep_flag = True
            for ignore_key in ignore_missing.split("|"):
                if ignore_key in key:
                    keep_flag = False
                    break
            if keep_flag:
                warn_missing_keys.append(key)
            else:
                ignore_missing_keys.append(key)

        missing_keys = warn_missing_keys

        if len(missing_keys) > 0:
            print(
                "Weights of {} not initialized from pretrained model: {}".format(
                    model.__class__.__name__, missing_keys
                )
            )
        if len(unexpected_keys) > 0:
            print("Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys))
        if len(ignore_missing_keys) > 0:
            print(
                "Ignored weights of {} not initialized from pretrained model: {}".format(
                    model.__class__.__name__, ignore_missing_keys
                )
            )
        if len(error_msgs) > 0:
            print("\n".join(error_msgs))

    # Architecture kwargs shared by pretrained and fine-tuned checkpoints.
    model_kwargs = {
        "pretrained": False,
        "use_shared_rel_pos_bias": True,
        "use_abs_pos_emb": False,
        "init_values": 0.1,
    }

    if is_finetuned:
        # Fine-tuned checkpoints carry a 1000-way head and per-layer relative bias.
        model_kwargs.update(
            {
                "num_classes": 1000,
                "use_mean_pooling": True,
                "init_scale": 0.001,
                "use_rel_pos_bias": True,
            }
        )

    model = create_model(
        "beit_large_patch16_224" if is_large else "beit_base_patch16_224",
        **model_kwargs,
    )
    patch_size = model.patch_embed.patch_size
    # Side effect: downstream code reads args.window_size (patches per side).
    args.window_size = (args.input_size // patch_size[0], args.input_size // patch_size[1])
    checkpoint = torch.load(args.beit_checkpoint, map_location="cpu")

    print(f"Load ckpt from {args.beit_checkpoint}")
    checkpoint_model = None
    # Checkpoints store the weights under either "model" or "module".
    for model_key in ("model", "module"):
        if model_key in checkpoint:
            checkpoint_model = checkpoint[model_key]
            print(f"Load state_dict by model_key = {model_key}")
            break

    all_keys = list(checkpoint_model.keys())
    for key in all_keys:
        # relative_position_index buffers are recomputed by the model, drop them.
        if "relative_position_index" in key:
            checkpoint_model.pop(key)

        if "relative_position_bias_table" in key:
            # Only square patch grids are supported; interpolation of the bias
            # table for non-square grids is not implemented.
            rel_pos_bias = checkpoint_model[key]
            src_num_pos, num_attn_heads = rel_pos_bias.size()
            dst_num_pos, _ = model.state_dict()[key].size()
            dst_patch_shape = model.patch_embed.patch_shape
            if dst_patch_shape[0] != dst_patch_shape[1]:
                raise NotImplementedError()

    load_state_dict(model, checkpoint_model, prefix="")

    return model
|
||||
|
||||
|
||||
def main():
    """Convert an original data2vec-vision (Beit-style) checkpoint to the HF format.

    Infers the model variant (base/large, pretrained/fine-tuned) from the target
    checkpoint name, runs both models on a fixture image, verifies the outputs
    match within 1e-3, and saves the HF model + feature extractor.

    Raises:
        Exception: if the original and converted model outputs disagree.
    """
    args = get_args()

    # The variant is encoded in the requested HF checkpoint name.
    is_finetuned = "ft1k" in args.hf_checkpoint_name
    is_large = "large" in args.hf_checkpoint_name

    if is_finetuned:
        # To convert Beit's data2vec_vision to HF you need to copy
        # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_finetune.py
        # into this folder.
        import modeling_finetune  # noqa: F401
    else:
        # To convert Beit's data2vec_vision to HF you need to copy
        # https://github.com/facebookresearch/data2vec_vision/blob/main/beit/modeling_cyclical.py
        # into this folder
        # IMPORTANT: Note that for now we've only converted the down-stream
        # model and not the full pretrained model. This means for the integration
        # test you need to add a `return x` after the following line:
        # https://github.com/facebookresearch/data2vec_vision/blob/af9a36349aaed59ae66e69b5dabeef2d62fdc5da/beit/modeling_cyclical.py#L197
        # to make the integration test pass.
        import modeling_cyclical  # noqa: F401

    # 1. Create model config
    config = Data2VecVisionConfig()
    if is_finetuned:
        config.use_relative_position_bias = True
        config.use_shared_relative_position_bias = False
        config.use_mean_pooling = True
        config.num_labels = 1000

        repo_id = "datasets/huggingface/label-files"
        filename = "imagenet-1k-id2label.json"
        # Use a context manager so the label-file handle is closed deterministically
        # (the previous json.load(open(...)) leaked the file object).
        with open(hf_hub_download(repo_id, filename), "r") as label_file:
            id2label = json.load(label_file)
        id2label = {int(k): v for k, v in id2label.items()}
        config.id2label = id2label
        config.label2id = {v: k for k, v in id2label.items()}
    else:
        config.use_relative_position_bias = False
        config.use_shared_relative_position_bias = True
        config.use_mean_pooling = False

    if is_large:
        config.hidden_size = 1024
        config.intermediate_size = 4096
        config.num_hidden_layers = 24
        config.num_attention_heads = 16

    # 2. Load Beit model
    orig_model = load_beit_model(args, is_finetuned, is_large)
    orig_model.eval()

    # 3. Forward Beit model
    feature_extractor = BeitFeatureExtractor(size=config.image_size, do_center_crop=False)
    image = Image.open("../../../../tests/fixtures/tests_samples/COCO/000000039769.png")
    encoding = feature_extractor(images=image, return_tensors="pt")
    pixel_values = encoding["pixel_values"]

    # The pretrained (cyclical) model additionally expects a bool_masked_pos arg.
    orig_args = (pixel_values,) if is_finetuned else (pixel_values, None)
    with torch.no_grad():
        orig_model_output = orig_model(*orig_args)

    # 4. Load HF Data2VecVision model
    if is_finetuned:
        hf_model = Data2VecVisionForImageClassification(config)
        hf_model.eval()
        has_lm_head = False
        hf_prefix = "data2vec_vision."
    else:
        hf_model = Data2VecVisionModel(config)
        hf_model.eval()
        has_lm_head = True
        hf_prefix = ""

    # Rename flat original keys to the HF naming scheme, then split qkv weights.
    rename_keys = create_rename_keys(config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
    state_dict = orig_model.state_dict()
    for src, dest in rename_keys:
        val = state_dict.pop(src)
        state_dict[dest] = val

    read_in_q_k_v(state_dict, config, hf_prefix=hf_prefix, has_lm_head=has_lm_head)
    missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False)
    print("HF missing", missing_keys)
    print("HF unexpected_keys", unexpected_keys)

    # 5. Forward HF Data2VecVision model
    with torch.no_grad():
        hf_model_output = hf_model(pixel_values)

    hf_output = hf_model_output.logits if is_finetuned else hf_model_output.last_hidden_state

    # 6. Compare
    max_absolute_diff = torch.max(torch.abs(hf_output - orig_model_output)).item()

    print(f"max_absolute_diff = {max_absolute_diff}")
    success = torch.allclose(hf_output, orig_model_output, atol=1e-3)
    print("Do both models output the same tensors?", "🔥" if success else "💩")
    if not success:
        raise Exception("Something went wRoNg")

    # 7. Save
    print(f"Saving to {args.hf_checkpoint_name}")
    hf_model.save_pretrained(args.hf_checkpoint_name)
    feature_extractor.save_pretrained(args.hf_checkpoint_name)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    main()
# Run the following to convert checkpoints (one invocation per released variant):
# python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
#         --beit_checkpoint ./pretrained_base.pt \
#         --hf_checkpoint_name "./data2vec-vision-base"
# python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
#         --beit_checkpoint ./finetuned_base.pt \
#         --hf_checkpoint_name "./data2vec-vision-base-ft1k"
# python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
#         --beit_checkpoint ./pretrained_large.pt \
#         --hf_checkpoint_name "./data2vec-vision-large"
# python ./convert_data2vec_vision_original_pytorch_checkpoint_to_pytorch.py \
#         --beit_checkpoint ./finetuned_large.pt \
#         --hf_checkpoint_name "./data2vec-vision-large-ft1k"
|
File diff suppressed because it is too large
Load Diff
|
@ -1222,6 +1222,9 @@ DATA2VEC_AUDIO_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
|||
DATA2VEC_TEXT_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
# Dummy placeholder; the real archive list lives in the torch-backed module.
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
class Data2VecAudioForAudioFrameClassification(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
@ -1320,6 +1323,34 @@ class Data2VecTextPreTrainedModel(metaclass=DummyObject):
|
|||
requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Data2VecVisionForImageClassification(metaclass=DummyObject):
    """Dummy stub that raises an informative error when torch is not installed."""

    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Data2VecVisionForSemanticSegmentation(metaclass=DummyObject):
    """Dummy stub that raises an informative error when torch is not installed."""

    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Data2VecVisionModel(metaclass=DummyObject):
    """Dummy stub that raises an informative error when torch is not installed."""

    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
class Data2VecVisionPreTrainedModel(metaclass=DummyObject):
    """Dummy stub that raises an informative error when torch is not installed."""

    _backends = ["torch"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch"])
|
||||
|
||||
|
||||
DEBERTA_PRETRAINED_MODEL_ARCHIVE_LIST = None
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,444 @@
|
|||
# coding=utf-8
|
||||
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
""" Testing suite for the PyTorch Data2VecVision model. """
|
||||
|
||||
|
||||
import inspect
|
||||
import unittest
|
||||
|
||||
from transformers import Data2VecVisionConfig
|
||||
from transformers.models.auto import get_values
|
||||
from transformers.testing_utils import require_torch, require_vision, slow, torch_device
|
||||
from transformers.utils import cached_property, is_torch_available, is_vision_available
|
||||
|
||||
from ..test_configuration_common import ConfigTester
|
||||
from ..test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers import (
|
||||
MODEL_MAPPING,
|
||||
Data2VecVisionForImageClassification,
|
||||
Data2VecVisionForSemanticSegmentation,
|
||||
Data2VecVisionModel,
|
||||
)
|
||||
from transformers.models.data2vec.modeling_data2vec_vision import (
|
||||
DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST,
|
||||
to_2tuple,
|
||||
)
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
|
||||
from transformers import BeitFeatureExtractor
|
||||
|
||||
|
||||
class Data2VecVisionModelTester:
    """Builds small Data2VecVision configs and random inputs for the unit tests below."""

    def __init__(
        self,
        parent,
        vocab_size=100,
        batch_size=13,
        image_size=30,
        patch_size=2,
        num_channels=3,
        is_training=True,
        use_labels=True,
        hidden_size=32,
        num_hidden_layers=4,
        num_attention_heads=4,
        intermediate_size=37,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        type_sequence_label_size=10,
        initializer_range=0.02,
        num_labels=3,
        scope=None,
        out_indices=[0, 1, 2, 3],
    ):
        self.parent = parent
        # Fix: honor the `vocab_size` argument instead of hard-coding 100
        # (the default keeps the previous behavior for existing callers).
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.is_training = is_training
        self.use_labels = use_labels
        self.hidden_size = hidden_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.intermediate_size = intermediate_size
        self.hidden_act = hidden_act
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.type_sequence_label_size = type_sequence_label_size
        self.initializer_range = initializer_range
        self.scope = scope
        # NOTE: `out_indices` has a (read-only) mutable default; it is stored as-is
        # and never mutated here.
        self.out_indices = out_indices
        self.num_labels = num_labels

    def prepare_config_and_inputs(self):
        """Return a config plus random pixel values and (optional) labels."""
        pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size])

        labels = None
        pixel_labels = None
        if self.use_labels:
            labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
            pixel_labels = ids_tensor([self.batch_size, self.image_size, self.image_size], self.num_labels)

        config = self.get_config()

        return config, pixel_values, labels, pixel_labels

    def get_config(self):
        """Build a small Data2VecVisionConfig from the tester's hyperparameters."""
        return Data2VecVisionConfig(
            vocab_size=self.vocab_size,
            image_size=self.image_size,
            patch_size=self.patch_size,
            num_channels=self.num_channels,
            hidden_size=self.hidden_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            hidden_act=self.hidden_act,
            hidden_dropout_prob=self.hidden_dropout_prob,
            attention_probs_dropout_prob=self.attention_probs_dropout_prob,
            is_decoder=False,
            initializer_range=self.initializer_range,
            out_indices=self.out_indices,
        )

    def create_and_check_model(self, config, pixel_values, labels, pixel_labels):
        """Check the base model output shape: (batch, num_patches + 1, hidden)."""
        model = Data2VecVisionModel(config=config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        # expected sequence length = num_patches + 1 (we add 1 for the [CLS] token)
        image_size = to_2tuple(self.image_size)
        patch_size = to_2tuple(self.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, num_patches + 1, self.hidden_size))

    def create_and_check_for_image_classification(self, config, pixel_values, labels, pixel_labels):
        """Check the classification head logits shape."""
        config.num_labels = self.type_sequence_label_size
        model = Data2VecVisionForImageClassification(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values, labels=labels)
        self.parent.assertEqual(result.logits.shape, (self.batch_size, self.type_sequence_label_size))

    def create_and_check_for_image_segmentation(self, config, pixel_values, labels, pixel_labels):
        """Check segmentation logits shape, with and without labels."""
        config.num_labels = self.num_labels
        model = Data2VecVisionForSemanticSegmentation(config)
        model.to(torch_device)
        model.eval()
        result = model(pixel_values)
        self.parent.assertEqual(
            result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2)
        )
        result = model(pixel_values, labels=pixel_labels)
        self.parent.assertEqual(
            result.logits.shape, (self.batch_size, self.num_labels, self.image_size * 2, self.image_size * 2)
        )

    def prepare_config_and_inputs_for_common(self):
        """Adapter for ModelTesterMixin: returns (config, inputs_dict)."""
        config_and_inputs = self.prepare_config_and_inputs()
        config, pixel_values, labels, pixel_labels = config_and_inputs
        inputs_dict = {"pixel_values": pixel_values}
        return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
class Data2VecVisionModelTest(ModelTesterMixin, unittest.TestCase):
    """
    Here we also overwrite some of the tests of test_modeling_common.py, as Data2VecVision does not use input_ids, inputs_embeds,
    attention_mask and seq_length.
    """

    all_model_classes = (
        (Data2VecVisionModel, Data2VecVisionForImageClassification, Data2VecVisionForSemanticSegmentation)
        if is_torch_available()
        else ()
    )

    # Pruning / embedding-resizing / head-masking do not apply to this vision model.
    test_pruning = False
    test_resize_embeddings = False
    test_head_masking = False

    def setUp(self):
        # hidden_size=37 mirrors the tester's default intermediate sizing convention.
        self.model_tester = Data2VecVisionModelTester(self)
        self.config_tester = ConfigTester(
            self, config_class=Data2VecVisionConfig, has_text_modality=False, hidden_size=37
        )

    def test_config(self):
        self.config_tester.run_common_tests()

    def test_inputs_embeds(self):
        # Data2VecVision does not use inputs_embeds
        pass

    def test_model_common_attributes(self):
        # Input embeddings are a patch-embedding module; there is no output embedding.
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            self.assertIsInstance(model.get_input_embeddings(), (nn.Module))
            x = model.get_output_embeddings()
            self.assertTrue(x is None or isinstance(x, nn.Linear))

    def test_forward_signature(self):
        # forward() must take pixel_values first (instead of input_ids).
        config, _ = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            model = model_class(config)
            signature = inspect.signature(model.forward)
            # signature.parameters is an OrderedDict => so arg_names order is deterministic
            arg_names = [*signature.parameters.keys()]

            expected_arg_names = ["pixel_values"]
            self.assertListEqual(arg_names[:1], expected_arg_names)

    def test_model(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_model(*config_and_inputs)

    def test_for_image_segmentation(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_segmentation(*config_and_inputs)

    def test_training(self):
        if not self.model_tester.is_training:
            return

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        for model_class in self.all_model_classes:
            # Base-model classes have no loss head, so skip them for training.
            if model_class in [*get_values(MODEL_MAPPING)]:
                continue

            model = model_class(config)
            model.to(torch_device)
            model.train()
            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            loss = model(**inputs).loss
            loss.backward()

    def test_training_gradient_checkpointing(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        if not self.model_tester.is_training:
            return

        config.use_cache = False
        config.return_dict = True

        for model_class in self.all_model_classes:
            if model_class in [*get_values(MODEL_MAPPING)] or not model_class.supports_gradient_checkpointing:
                continue
            # TODO: remove the following 3 lines once we have a MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING
            # this can then be incorporated into _prepare_for_class in test_modeling_common.py
            elif model_class.__name__ == "Data2VecVisionForSemanticSegmentation":
                batch_size, num_channels, height, width = inputs_dict["pixel_values"].shape
                inputs_dict["labels"] = torch.zeros(
                    [self.model_tester.batch_size, height, width], device=torch_device
                ).long()
            model = model_class(config)
            model.gradient_checkpointing_enable()
            model.to(torch_device)
            model.train()
            inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True)
            loss = model(**inputs).loss
            loss.backward()

    def test_initialization(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        configs_no_init = _config_zero_init(config)
        for model_class in self.all_model_classes:
            model = model_class(config=configs_no_init)
            for name, param in model.named_parameters():
                # we skip lambda parameters as these require special initial values
                # determined by config.layer_scale_init_value
                if "lambda" in name:
                    continue
                if param.requires_grad:
                    self.assertIn(
                        ((param.data.mean() * 1e9).round() / 1e9).item(),
                        [0.0, 1.0],
                        msg=f"Parameter {name} of model {model_class} seems not properly initialized",
                    )

    def test_attention_outputs(self):
        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
        config.return_dict = True

        # in Data2VecVision, the seq_len equals the number of patches + 1 (we add 1 for the [CLS] token)
        image_size = to_2tuple(self.model_tester.image_size)
        patch_size = to_2tuple(self.model_tester.patch_size)
        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        seq_len = num_patches + 1
        encoder_seq_length = getattr(self.model_tester, "encoder_seq_length", seq_len)
        encoder_key_length = getattr(self.model_tester, "key_length", encoder_seq_length)
        chunk_length = getattr(self.model_tester, "chunk_length", None)
        if chunk_length is not None and hasattr(self.model_tester, "num_hashes"):
            encoder_seq_length = encoder_seq_length * self.model_tester.num_hashes

        for model_class in self.all_model_classes:
            # First pass: attentions requested via inputs_dict.
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = False
            config.return_dict = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))
            attentions = outputs.encoder_attentions if config.is_encoder_decoder else outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            # check that output_attentions also work using config
            del inputs_dict["output_attentions"]
            config.output_attentions = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            attentions = outputs.attentions
            self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)

            self.assertListEqual(
                list(attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
            )
            out_len = len(outputs)

            # Check attention is always last and order is fine
            inputs_dict["output_attentions"] = True
            inputs_dict["output_hidden_states"] = True
            model = model_class(config)
            model.to(torch_device)
            model.eval()
            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            # Requesting hidden states adds exactly one entry to the output tuple.
            self.assertEqual(out_len + 1, len(outputs))

            self_attentions = outputs.attentions

            self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
            self.assertListEqual(
                list(self_attentions[0].shape[-3:]),
                [self.model_tester.num_attention_heads, encoder_seq_length, encoder_key_length],
            )

    def test_hidden_states_output(self):
        def check_hidden_states_output(inputs_dict, config, model_class):
            model = model_class(config)
            model.to(torch_device)
            model.eval()

            with torch.no_grad():
                outputs = model(**self._prepare_for_class(inputs_dict, model_class))

            hidden_states = outputs.encoder_hidden_states if config.is_encoder_decoder else outputs.hidden_states

            # num layers + 1: embedding output plus each transformer layer's output.
            expected_num_layers = getattr(
                self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1
            )
            self.assertEqual(len(hidden_states), expected_num_layers)

            # Data2VecVision has a different seq_length
            image_size = to_2tuple(self.model_tester.image_size)
            patch_size = to_2tuple(self.model_tester.patch_size)
            num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
            seq_length = num_patches + 1

            self.assertListEqual(
                list(hidden_states[0].shape[-2:]),
                [seq_length, self.model_tester.hidden_size],
            )

        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()

        for model_class in self.all_model_classes:
            inputs_dict["output_hidden_states"] = True
            check_hidden_states_output(inputs_dict, config, model_class)

            # check that output_hidden_states also work using config
            del inputs_dict["output_hidden_states"]
            config.output_hidden_states = True

            check_hidden_states_output(inputs_dict, config, model_class)

    def test_for_image_classification(self):
        config_and_inputs = self.model_tester.prepare_config_and_inputs()
        self.model_tester.create_and_check_for_image_classification(*config_and_inputs)

    @slow
    def test_model_from_pretrained(self):
        for model_name in DATA2VEC_VISION_PRETRAINED_MODEL_ARCHIVE_LIST[:1]:
            model = Data2VecVisionModel.from_pretrained(model_name)
            self.assertIsNotNone(model)
|
||||
|
||||
|
||||
# We will verify our results on an image of cute cats
|
||||
def prepare_img():
    """Load the standard COCO cats fixture image used by the integration tests."""
    return Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
|
||||
|
||||
|
||||
@require_torch
@require_vision
class Data2VecVisionModelIntegrationTest(unittest.TestCase):
    """Slow integration test against the released facebook/data2vec-vision-base-ft1k checkpoint."""

    @cached_property
    def default_feature_extractor(self):
        # Feature extractor matching the checkpoint; None when vision deps are missing.
        return (
            BeitFeatureExtractor.from_pretrained("facebook/data2vec-vision-base-ft1k")
            if is_vision_available()
            else None
        )

    @slow
    def test_inference_image_classification_head_imagenet_1k(self):
        model = Data2VecVisionForImageClassification.from_pretrained("facebook/data2vec-vision-base-ft1k").to(
            torch_device
        )

        feature_extractor = self.default_feature_extractor
        image = prepare_img()
        inputs = feature_extractor(images=image, return_tensors="pt").to(torch_device)

        # forward pass
        with torch.no_grad():
            outputs = model(**inputs)
        logits = outputs.logits

        # verify the logits
        expected_shape = torch.Size((1, 1000))
        self.assertEqual(logits.shape, expected_shape)

        # Reference slice produced by the original fairseq implementation.
        expected_slice = torch.tensor([0.3277, -0.1395, 0.0911]).to(torch_device)

        self.assertTrue(torch.allclose(logits[0, :3], expected_slice, atol=1e-4))

        # Top-2 predictions for the cats-and-remotes fixture image.
        expected_top2 = [model.config.label2id[i] for i in ["remote control, remote", "tabby, tabby cat"]]
        self.assertEqual(logits[0].topk(2).indices.cpu().tolist(), expected_top2)
|
|
@ -186,6 +186,7 @@ MODEL_TYPE_TO_DOC_MAPPING = OrderedDict(
|
|||
[
|
||||
("data2vec-text", "data2vec"),
|
||||
("data2vec-audio", "data2vec"),
|
||||
("data2vec-vision", "data2vec"),
|
||||
]
|
||||
)
|
||||
|
||||
|
|
|
@ -15,6 +15,7 @@ src/transformers/models/blenderbot/modeling_blenderbot.py
|
|||
src/transformers/models/blenderbot_small/modeling_blenderbot_small.py
|
||||
src/transformers/models/convnext/modeling_convnext.py
|
||||
src/transformers/models/data2vec/modeling_data2vec_audio.py
|
||||
src/transformers/models/data2vec/modeling_data2vec_vision.py
|
||||
src/transformers/models/deit/modeling_deit.py
|
||||
src/transformers/models/dpt/modeling_dpt.py
|
||||
src/transformers/models/electra/modeling_electra.py
|
||||
|
|
Loading…
Reference in New Issue