From 33288ff15011ad4291effa3f1e4912acecc24399 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 2 Apr 2024 11:18:03 +0200 Subject: [PATCH 001/110] [bnb] Fix bug in `_replace_with_bnb_linear` (#29958) fix bug --- src/transformers/integrations/bitsandbytes.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/transformers/integrations/bitsandbytes.py b/src/transformers/integrations/bitsandbytes.py index e038768b97..f340c1db82 100644 --- a/src/transformers/integrations/bitsandbytes.py +++ b/src/transformers/integrations/bitsandbytes.py @@ -156,7 +156,10 @@ def _replace_with_bnb_linear( if (isinstance(module, nn.Linear) or isinstance(module, Conv1D)) and name not in modules_to_not_convert: # Check if the current key is not in the `modules_to_not_convert` - if not any(key in ".".join(current_key_name) for key in modules_to_not_convert): + current_key_name_str = ".".join(current_key_name) + if not any( + (key + "." in current_key_name_str) or (key == current_key_name_str) for key in modules_to_not_convert + ): with init_empty_weights(): if isinstance(module, Conv1D): in_features, out_features = module.weight.shape From fed27ffc7ec62837dca9bbfc83442eb3678ee026 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?th=C3=A9o=20gigant?= <71786646+giganttheo@users.noreply.github.com> Date: Tue, 2 Apr 2024 11:39:33 +0200 Subject: [PATCH 002/110] Adding FlaxNoRepeatNGramLogitsProcessor (#29677) * fix issue with logit processor in beam search in Flax * adding FlaxNoRepeatNGramLogitsProcessor class + unit test * style correction and code verification * add FlaxNoRepeatNGramLogitsProcessor to the test_processor_list and test_processor_list_jitted tests * fix an issue where ngrams are banned only if they appear ==1 time + update description of get_previous_ngrams * replace non-jit compatible masking of ngrams that are not yet generated with jittable version * Revert "fix issue with logit processor in beam search in Flax" This reverts commit 09b70d7e4dc32d0cc4db61af09a835a9cd238b50. 
* add FlaxNoRepeatNGramLogitsProcessor to _get_logits_processor
* change the method of casting to boolean of banned tokens indices
* fix code style
* remove some useless operations + significantly faster computation of update indices using jax.lax.fori_loop
* remove useless loop iterations
* set some variables that were calculated and used multiple times
* fix format
---
 src/transformers/generation/__init__.py      |  2 +
 .../generation/flax_logits_process.py        | 87 +++++++++++++++++++
 src/transformers/generation/flax_utils.py    |  3 +
 tests/generation/test_flax_logits_process.py | 45 +++++++++-
 4 files changed, 135 insertions(+), 2 deletions(-)

diff --git a/src/transformers/generation/__init__.py b/src/transformers/generation/__init__.py
index 315d5b08a7..6653f3c8d1 100644
--- a/src/transformers/generation/__init__.py
+++ b/src/transformers/generation/__init__.py
@@ -162,6 +162,7 @@ else:
            "FlaxTopKLogitsWarper",
            "FlaxTopPLogitsWarper",
            "FlaxWhisperTimeStampLogitsProcessor",
+           "FlaxNoRepeatNGramLogitsProcessor",
        ]
        _import_structure["flax_utils"] = [
            "FlaxGenerationMixin",
@@ -294,6 +295,7 @@ if TYPE_CHECKING:
            FlaxLogitsProcessorList,
            FlaxLogitsWarper,
            FlaxMinLengthLogitsProcessor,
+           FlaxNoRepeatNGramLogitsProcessor,
            FlaxSuppressTokensAtBeginLogitsProcessor,
            FlaxSuppressTokensLogitsProcessor,
            FlaxTemperatureLogitsWarper,
diff --git a/src/transformers/generation/flax_logits_process.py b/src/transformers/generation/flax_logits_process.py
index 5c30b92755..84b5a38d5d 100644
--- a/src/transformers/generation/flax_logits_process.py
+++ b/src/transformers/generation/flax_logits_process.py
@@ -18,6 +18,7 @@ import inspect
 import jax
 import jax.lax as lax
 import jax.numpy as jnp
+from jax.experimental import sparse
 
 from ..utils import add_start_docstrings
 from ..utils.logging import get_logger
@@ -455,3 +456,89 @@ class FlaxWhisperTimeStampLogitsProcessor(FlaxLogitsProcessor):
         scores = jax.vmap(handle_cumulative_probs)(logprobs, scores)
 
         return scores
+
+
+class FlaxNoRepeatNGramLogitsProcessor(FlaxLogitsProcessor):
+    r"""
+    [`FlaxLogitsProcessor`] that enforces no repetition of n-grams. See
+    [Fairseq](https://github.com/pytorch/fairseq/blob/a07cb6f40480928c9e0548b737aadd36ee66ac76/fairseq/sequence_generator.py#L345).
+
+    Args:
+        ngram_size (`int`):
+            All n-grams of size `ngram_size` can only occur once.
+    """
+
+    def __init__(self, ngram_size: int):
+        if not isinstance(ngram_size, int) or ngram_size <= 0:
+            raise ValueError(f"`ngram_size` has to be a strictly positive integer, but is {ngram_size}")
+        self.ngram_size = ngram_size
+
+    def get_previous_ngrams(self, input_ids: jnp.ndarray, vocab_size: int, cur_len: int):
+        """
+        Get a matrix of size (batch_size,) + (vocab_size,)*n (for n-grams) that
+        represents the n-grams that occurred previously.
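+        For example, with `ngram_size=2` and `input_ids` [[1, 1, 2, 1]], the non-zero
+        entries of the (batch_size, vocab_size, vocab_size) matrix are
+        (0, 1, 1), (0, 1, 2) and (0, 2, 1).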
+        The BCOO representation allows storing only the few non-zero entries, instead of the full (huge) matrix.
+        """
+        batch_size, seq_len = input_ids.shape
+        # number of n-grams in the whole sequence
+        seq_ngrams = seq_len - (self.ngram_size - 1)
+        # number of n-grams in the currently generated sequence
+        cur_ngrams = cur_len - (self.ngram_size - 1)
+
+        def body_fun(i, val):
+            b = i % batch_size
+            pos = i // batch_size
+            return val.at[i].set(
+                jnp.array(
+                    [
+                        b,
+                    ]
+                    + [jnp.array(input_ids)[b, pos + j] for j in range(self.ngram_size)]
+                )
+            )
+
+        shape = (batch_size * seq_ngrams, self.ngram_size + 1)
+        all_update_indices = jax.lax.fori_loop(
+            0, batch_size * cur_ngrams, body_fun, jnp.zeros(shape, dtype=input_ids.dtype)
+        )
+
+        # ignore the n-grams not yet generated
+        data = (jnp.arange(batch_size * seq_ngrams) < batch_size * cur_ngrams).astype("float32")
+
+        return sparse.BCOO((data, all_update_indices), shape=(batch_size,) + (vocab_size,) * self.ngram_size)
+
+    def get_banned_tokens_mask(self, latest_tokens: jnp.ndarray, previous_ngrams) -> jnp.ndarray:
+        """
+        Determines which tokens must be banned given the latest tokens and the previously seen
+        n-grams.
+        """
+
+        @sparse.sparsify
+        @jax.vmap
+        def inner_fn(latest_tokens, previous_ngrams):
+            return previous_ngrams[tuple(latest_tokens)]
+
+        return sparse.bcoo_todense(inner_fn(latest_tokens, previous_ngrams))
+
+    def __call__(self, input_ids: jnp.ndarray, scores: jnp.ndarray, cur_len: int) -> jnp.ndarray:
+        def true_fn():
+            _, vocab_size = scores.shape
+            # store the previously seen n-grams
+            previous_ngrams = self.get_previous_ngrams(input_ids, vocab_size, cur_len)
+
+            # get the n-1 last tokens that prefix the n-gram being generated
+            latest_tokens = jnp.zeros((input_ids.shape[0], self.ngram_size - 1), dtype=input_ids.dtype)
+            latest_tokens = jax.lax.dynamic_update_slice(
+                latest_tokens,
+                jax.lax.dynamic_slice(
+                    input_ids, (0, cur_len - (self.ngram_size - 1)), (input_ids.shape[0], (self.ngram_size - 1))
+                ),
+                (0, 0),
+            )
+
+            # compute the banned tokens, i.e. all the tokens that, when added to the latest tokens, lead to an n-gram that was previously generated
+            banned_tokens_indices_mask = self.get_banned_tokens_mask(latest_tokens, previous_ngrams).astype("bool")
+            return jnp.where(banned_tokens_indices_mask, -float("inf"), scores)
+
+        output = jax.lax.cond((cur_len >= self.ngram_size - 1), true_fn, lambda: scores)
+        return output
diff --git a/src/transformers/generation/flax_utils.py b/src/transformers/generation/flax_utils.py
index 3a89c1ed41..08480ac983 100644
--- a/src/transformers/generation/flax_utils.py
+++ b/src/transformers/generation/flax_utils.py
@@ -40,6 +40,7 @@ from .flax_logits_process import (
     FlaxForceTokensLogitsProcessor,
     FlaxLogitsProcessorList,
     FlaxMinLengthLogitsProcessor,
+    FlaxNoRepeatNGramLogitsProcessor,
     FlaxSuppressTokensAtBeginLogitsProcessor,
     FlaxSuppressTokensLogitsProcessor,
     FlaxTemperatureLogitsWarper,
@@ -534,6 +535,8 @@ class FlaxGenerationMixin:
                 [input_ids_seq_length + i[0] - 1, i[1]] for i in generation_config.forced_decoder_ids
             ]
             processors.append(FlaxForceTokensLogitsProcessor(forced_decoder_ids))
+        if generation_config.no_repeat_ngram_size is not None and generation_config.no_repeat_ngram_size > 0:
+            processors.append(FlaxNoRepeatNGramLogitsProcessor(generation_config.no_repeat_ngram_size))
         processors = self._merge_criteria_processor_list(processors, logits_processor)
 
         return processors
diff --git a/tests/generation/test_flax_logits_process.py b/tests/generation/test_flax_logits_process.py
index 
a45d75ae24..bd5f8f648c 100644 --- a/tests/generation/test_flax_logits_process.py +++ b/tests/generation/test_flax_logits_process.py @@ -33,6 +33,7 @@ if is_flax_available(): FlaxForcedEOSTokenLogitsProcessor, FlaxLogitsProcessorList, FlaxMinLengthLogitsProcessor, + FlaxNoRepeatNGramLogitsProcessor, FlaxTemperatureLogitsWarper, FlaxTopKLogitsWarper, FlaxTopPLogitsWarper, @@ -197,6 +198,26 @@ class LogitsProcessorTest(unittest.TestCase): scores = logits_processor(input_ids, scores, cur_len=cur_len) self.assertFalse(jnp.isinf(scores).any()) + def test_no_repeat_ngram_dist_processor(self): + vocab_size = 3 + batch_size = 2 + + cur_len = 4 + input_ids = np.array([[1, 1, 2, 1], [0, 1, 0, 1]], dtype="i4") + scores = self._get_uniform_logits(batch_size, vocab_size) + + no_repeat_proc_2_gram = FlaxNoRepeatNGramLogitsProcessor(2) + no_repeat_proc_3_gram = FlaxNoRepeatNGramLogitsProcessor(3) + + filtered_scores_2_gram = no_repeat_proc_2_gram(input_ids, scores, cur_len=cur_len) + filtered_scores_3_gram = no_repeat_proc_3_gram(input_ids, scores, cur_len=cur_len) + + # 2-gram would forbid 2nd and 3rd token (1,2) at 1st batch and 1st token (0) at 2nd batch + self.assertListEqual(jnp.isinf(filtered_scores_2_gram).tolist(), [[False, True, True], [True, False, False]]) + + # 3-gram would forbid no token at 1st batch and 1st token (0) at 2nd batch + self.assertListEqual(jnp.isinf(filtered_scores_3_gram).tolist(), [[False, False, False], [True, False, False]]) + def test_processor_list(self): batch_size = 4 sequence_length = 10 @@ -216,6 +237,7 @@ class LogitsProcessorTest(unittest.TestCase): temp_dist_warp = FlaxTemperatureLogitsWarper(temperature=0.5) top_k_warp = FlaxTopKLogitsWarper(3) top_p_warp = FlaxTopPLogitsWarper(0.8) + no_repeat_proc = FlaxNoRepeatNGramLogitsProcessor(2) # instantiate all logits processors min_dist_proc = FlaxMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) @@ -231,10 +253,19 @@ class LogitsProcessorTest(unittest.TestCase): scores = min_dist_proc(input_ids, scores, cur_len=cur_len) scores = bos_dist_proc(input_ids, scores, cur_len=cur_len) scores = eos_dist_proc(input_ids, scores, cur_len=cur_len) + scores = no_repeat_proc(input_ids, scores, cur_len=cur_len) # with processor list processor = FlaxLogitsProcessorList( - [temp_dist_warp, top_k_warp, top_p_warp, min_dist_proc, bos_dist_proc, eos_dist_proc] + [ + temp_dist_warp, + top_k_warp, + top_p_warp, + min_dist_proc, + bos_dist_proc, + eos_dist_proc, + no_repeat_proc, + ] ) scores_comp = processor(input_ids, scores_comp, cur_len=cur_len) @@ -263,6 +294,7 @@ class LogitsProcessorTest(unittest.TestCase): temp_dist_warp = FlaxTemperatureLogitsWarper(temperature=0.5) top_k_warp = FlaxTopKLogitsWarper(3) top_p_warp = FlaxTopPLogitsWarper(0.8) + no_repeat_proc = FlaxNoRepeatNGramLogitsProcessor(2) # instantiate all logits processors min_dist_proc = FlaxMinLengthLogitsProcessor(min_length=10, eos_token_id=eos_token_id) @@ -279,12 +311,21 @@ class LogitsProcessorTest(unittest.TestCase): scores = min_dist_proc(input_ids, scores, cur_len=cur_len) scores = bos_dist_proc(input_ids, scores, cur_len=cur_len) scores = eos_dist_proc(input_ids, scores, cur_len=cur_len) + scores = no_repeat_proc(input_ids, scores, cur_len=cur_len) return scores # with processor list def run_processor_list(input_ids, scores, cur_len): processor = FlaxLogitsProcessorList( - [temp_dist_warp, top_k_warp, top_p_warp, min_dist_proc, bos_dist_proc, eos_dist_proc] + [ + temp_dist_warp, + top_k_warp, + top_p_warp, + min_dist_proc, + bos_dist_proc, + 
eos_dist_proc, + no_repeat_proc, + ] ) scores = processor(input_ids, scores, cur_len=cur_len) return scores From 0d04b1e25a79ef18af419881d708fafc665851c7 Mon Sep 17 00:00:00 2001 From: Yoach Lacombe <52246514+ylacombe@users.noreply.github.com> Date: Tue, 2 Apr 2024 11:23:49 +0100 Subject: [PATCH 003/110] Add Flash Attention 2 support to Musicgen and Musicgen Melody (#29939) * add FA2 to o.g Musicgen * make style * add FA2 support to Musicgen Melody * add generation FA2 tests to o.g Musicgen * make style and fix copies * add Musicgen to FA2 docs + deprecate list * add sdpa supports to Musicgen's * make style and fix copies * refactor attention implementation arguments * add Copied from to sdpa tests * add copied form in sdpa tests melody * add copied for FA2 generation tests * add FA2 inference copied from * make style --- docs/source/en/perf_infer_gpu_one.md | 4 + .../models/deprecated/_archive_maps.py | 6 + .../models/musicgen/configuration_musicgen.py | 17 + .../models/musicgen/modeling_musicgen.py | 406 +++++- .../configuration_musicgen_melody.py | 21 +- .../modeling_musicgen_melody.py | 383 ++++- .../models/musicgen/test_modeling_musicgen.py | 1250 ++++++++++++++++- .../test_modeling_musicgen_melody.py | 1250 ++++++++++++++++- 8 files changed, 3313 insertions(+), 24 deletions(-) diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md index 0fbea1cd8d..5683f1e78b 100644 --- a/docs/source/en/perf_infer_gpu_one.md +++ b/docs/source/en/perf_infer_gpu_one.md @@ -55,6 +55,8 @@ FlashAttention-2 is currently supported for the following architectures: * [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel) * [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel) * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel) +* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) +* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel) * [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel) * [StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm#transformers.StableLmModel) @@ -190,6 +192,8 @@ For now, Transformers supports SDPA inference and training for the following arc * [Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2#transformers.Starcoder2Model) * [Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2#transformers.Qwen2Model) * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel) +* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel) +* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel) diff --git a/src/transformers/models/deprecated/_archive_maps.py b/src/transformers/models/deprecated/_archive_maps.py index f7b0679a3e..f195ac0706 100644 --- a/src/transformers/models/deprecated/_archive_maps.py +++ b/src/transformers/models/deprecated/_archive_maps.py @@ -1470,6 +1470,12 @@ MUSICGEN_PRETRAINED_CONFIG_ARCHIVE_MAP = DeprecatedDict( MUSICGEN_PRETRAINED_MODEL_ARCHIVE_LIST = DeprecatedList(["facebook/musicgen-small"]) +MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP = DeprecatedDict( + {"facebook/musicgen-melody": 
"https://huggingface.co/facebook/musicgen-melody/resolve/main/config.json"} +) + +MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST = DeprecatedList(["facebook/musicgen-melody"]) + MVP_PRETRAINED_MODEL_ARCHIVE_LIST = DeprecatedList( [ "RUCAIBox/mvp", diff --git a/src/transformers/models/musicgen/configuration_musicgen.py b/src/transformers/models/musicgen/configuration_musicgen.py index 9d835835df..b102d67630 100644 --- a/src/transformers/models/musicgen/configuration_musicgen.py +++ b/src/transformers/models/musicgen/configuration_musicgen.py @@ -239,3 +239,20 @@ class MusicgenConfig(PretrainedConfig): # This is a property because you might want to change the codec model on the fly def sampling_rate(self): return self.audio_encoder.sampling_rate + + @property + def _attn_implementation(self): + # This property is made private for now (as it cannot be changed and a PreTrainedModel.use_attn_implementation method needs to be implemented.) + if hasattr(self, "_attn_implementation_internal"): + if self._attn_implementation_internal is None: + # `config.attn_implementation` should never be None, for backward compatibility. + return "eager" + else: + return self._attn_implementation_internal + else: + return "eager" + + @_attn_implementation.setter + def _attn_implementation(self, value): + self._attn_implementation_internal = value + self.decoder._attn_implementation = value diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py index 99e06f7df1..2520268f74 100644 --- a/src/transformers/models/musicgen/modeling_musicgen.py +++ b/src/transformers/models/musicgen/modeling_musicgen.py @@ -22,13 +22,19 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union import torch import torch.nn as nn +import torch.nn.functional as F from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...generation.configuration_utils import GenerationConfig from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList from ...generation.stopping_criteria import StoppingCriteriaList -from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from ...modeling_attn_mask_utils import ( + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) from ...modeling_outputs import ( BaseModelOutput, BaseModelOutputWithPastAndCrossAttentions, @@ -40,6 +46,8 @@ from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, ) @@ -48,6 +56,10 @@ from ..auto.modeling_auto import AutoModel from .configuration_musicgen import MusicgenConfig, MusicgenDecoderConfig +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + if TYPE_CHECKING: from ...generation.streamers import BaseStreamer @@ -60,6 +72,19 @@ _CHECKPOINT_FOR_DOC = "facebook/musicgen-small" from ..deprecated._archive_maps import MUSICGEN_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402 +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), 
as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + @dataclass class MusicgenUnconditionalInput(ModelOutput): """ @@ -302,29 +327,361 @@ class MusicgenAttention(nn.Module): return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->Musicgen +class MusicgenFlashAttention2(MusicgenAttention): + """ + Musicgen flash attention module. This module inherits from `MusicgenAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # MusicgenFlashAttention2 attention does not support output_attentions + if output_attentions: + raise ValueError("MusicgenFlashAttention2 attention does not support output_attentions") + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, q_len, _ = hidden_states.size() + + # get query proj + query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0].transpose(1, 2) + value_states = past_key_value[1].transpose(1, 2) + elif is_cross_attention: + # cross_attentions + key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) + value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + key_states = 
torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) + value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) + else: + # self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout + ) + + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. 
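+        (If `attention_mask` is None, the unpadding step is skipped and the dense
+        `flash_attn_func` kernel is called directly.)
+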
+ Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. + causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
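+            # e.g. with a left-padded mask [[0, 0, 1, 1]] and query_length == 2, the
+            # slice keeps [[1, 1]], i.e. exactly the positions of the live query tokens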
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.bart.modeling_bart.BartSdpaAttention with Bart->Musicgen +class MusicgenSdpaAttention(MusicgenAttention): + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + if output_attentions or layer_head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MusicgenModel is using MusicgenSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" + ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + key_value_states=key_value_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + query_states = self._shape(query_states, tgt_len, bsz) + + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, + # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + ) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + +MUSICGEN_ATTENTION_CLASSES = { + "eager": MusicgenAttention, + "sdpa": MusicgenSdpaAttention, + "flash_attention_2": MusicgenFlashAttention2, +} + + class MusicgenDecoderLayer(nn.Module): def __init__(self, config: MusicgenDecoderConfig): super().__init__() self.embed_dim = config.hidden_size - self.self_attn = MusicgenAttention( + self.self_attn = MUSICGEN_ATTENTION_CLASSES[config._attn_implementation]( embed_dim=self.embed_dim, num_heads=config.num_attention_heads, dropout=config.attention_dropout, is_decoder=True, bias=False, + is_causal=True, + config=config, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] self.activation_dropout = config.activation_dropout self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) - self.encoder_attn = MusicgenAttention( + self.encoder_attn = MUSICGEN_ATTENTION_CLASSES[config._attn_implementation]( self.embed_dim, config.num_attention_heads, dropout=config.attention_dropout, is_decoder=True, bias=False, + config=config, ) self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) self.fc1 = nn.Linear(self.embed_dim, config.ffn_dim, bias=False) @@ -432,6 +789,8 @@ class MusicgenPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MusicgenDecoderLayer", "MusicgenAttention"] + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): std = self.config.initializer_factor @@ -667,6 +1026,7 @@ class MusicgenDecoder(MusicgenPreTrainedModel): self.layers = nn.ModuleList([MusicgenDecoderLayer(config) for _ in range(config.num_hidden_layers)]) self.layer_norm = nn.LayerNorm(config.hidden_size) + self.attn_implementation = config._attn_implementation self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -721,16 +1081,40 @@ class MusicgenDecoder(MusicgenPreTrainedModel): if inputs_embeds is None: inputs_embeds = 
sum([self.embed_tokens[codebook](input[:, codebook]) for codebook in range(num_codebooks)]) - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) + if self.attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self.attn_implementation == "sdpa" and head_mask is None and not output_attentions: + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + inputs_embeds, + past_key_values_length, + ) + else: + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) # expand encoder attention mask if encoder_hidden_states is not None and encoder_attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - encoder_attention_mask = _prepare_4d_attention_mask( - encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] - ) + if self.attn_implementation == "flash_attention_2": + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self.attn_implementation == "sdpa" and cross_attn_head_mask is None and not output_attentions: + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) # embed positions positions = self.embed_positions(input, past_key_values_length) @@ -1409,6 +1793,8 @@ class MusicgenForConditionalGeneration(PreTrainedModel): base_model_prefix = "encoder_decoder" main_input_name = "input_ids" supports_gradient_checkpointing = True + _supports_flash_attn_2 = True + _supports_sdpa = True def __init__( self, diff --git a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py index 8945937129..335c051416 100644 --- a/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py @@ -21,9 +21,7 @@ from ..auto.configuration_auto import AutoConfig logger = logging.get_logger(__name__) -MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP = { - "facebook/musicgen-melody": "https://huggingface.co/facebook/musicgen-melody/resolve/main/config.json", -} +from ..deprecated._archive_maps import MUSICGEN_MELODY_PRETRAINED_CONFIG_ARCHIVE_MAP # noqa: F401, E402 class MusicgenMelodyDecoderConfig(PretrainedConfig): @@ -254,3 +252,20 @@ class MusicgenMelodyConfig(PretrainedConfig): # This is a property because you might want to change the codec model on the fly def sampling_rate(self): return self.audio_encoder.sampling_rate + + @property + def _attn_implementation(self): + # This property is made private for now (as it cannot be changed and a PreTrainedModel.use_attn_implementation method 
needs to be implemented.) + if hasattr(self, "_attn_implementation_internal"): + if self._attn_implementation_internal is None: + # `config.attn_implementation` should never be None, for backward compatibility. + return "eager" + else: + return self._attn_implementation_internal + else: + return "eager" + + @_attn_implementation.setter + def _attn_implementation(self, value): + self._attn_implementation_internal = value + self.decoder._attn_implementation = value diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py index 8b5c5c2f57..8b0afb2367 100644 --- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py +++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py @@ -22,13 +22,14 @@ from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple, Union import torch import torch.nn as nn +import torch.nn.functional as F from torch.nn import CrossEntropyLoss from ...activations import ACT2FN from ...generation.configuration_utils import GenerationConfig from ...generation.logits_process import ClassifierFreeGuidanceLogitsProcessor, LogitsProcessorList from ...generation.stopping_criteria import StoppingCriteriaList -from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask +from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa from ...modeling_outputs import ( BaseModelOutputWithPast, ModelOutput, @@ -37,6 +38,8 @@ from ...modeling_utils import PreTrainedModel from ...utils import ( add_start_docstrings, add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, logging, replace_return_docstrings, ) @@ -45,6 +48,10 @@ from ..auto.modeling_auto import AutoModel, AutoModelForTextEncoding from .configuration_musicgen_melody import MusicgenMelodyConfig, MusicgenMelodyDecoderConfig +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + if TYPE_CHECKING: from ...generation.streamers import BaseStreamer @@ -53,10 +60,20 @@ logger = logging.get_logger(__name__) _CONFIG_FOR_DOC = "MusicgenMelodyConfig" _CHECKPOINT_FOR_DOC = "facebook/musicgen-melody" -MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST = [ - "facebook/musicgen-melody", - # See all Musicgen Melody models at https://huggingface.co/models?filter=musicgen_melody -] +from ..deprecated._archive_maps import MUSICGEN_MELODY_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402 + + +# Copied from transformers.models.llama.modeling_llama._get_unpad_data +def _get_unpad_data(attention_mask): + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) @dataclass @@ -324,17 +341,348 @@ class MusicgenMelodyAttention(nn.Module): return attn_output, attn_weights_reshaped, past_key_value +# Copied from transformers.models.bart.modeling_bart.BartFlashAttention2 with Bart->MusicgenMelody +class MusicgenMelodyFlashAttention2(MusicgenMelodyAttention): + """ + MusicgenMelody flash attention module. This module inherits from `MusicgenMelodyAttention` as the weights of the module stays + untouched. 
The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10() + + def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # MusicgenMelodyFlashAttention2 attention does not support output_attentions + if output_attentions: + raise ValueError("MusicgenMelodyFlashAttention2 attention does not support output_attentions") + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, q_len, _ = hidden_states.size() + + # get query proj + query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0].transpose(1, 2) + value_states = past_key_value[1].transpose(1, 2) + elif is_cross_attention: + # cross_attentions + key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) + value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) + value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) + else: + # self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. 
Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = self._flash_attention_forward( + query_states, key_states, value_states, attention_mask, q_len, dropout=self.dropout + ) + + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward + def _flash_attention_forward( + self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None + ): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + """ + if not self._flash_attn_uses_top_left_mask: + causal = self.is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__. 
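+        #   (i.e. `causal` is deliberately disabled for single-token decode steps, where the
+        #   query attends to every cached key and no causal mask is needed)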
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +# Copied from transformers.models.bart.modeling_bart.BartSdpaAttention with Bart->MusicgenMelody +class MusicgenMelodySdpaAttention(MusicgenMelodyAttention): + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + if output_attentions or layer_head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "MusicgenMelodyModel is using MusicgenMelodySdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. 
Falling back to the manual attention" + ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + key_value_states=key_value_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + query_states = self._shape(query_states, tgt_len, bsz) + + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, + # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. 
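+            #   (a lone query token attends to the whole prefix anyway, so dropping the mask
+            #   at that step is safe)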
+ is_causal=self.is_causal and attention_mask is None and tgt_len > 1, + ) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + +MUSICGEN_MELODY_ATTENTION_CLASSES = { + "eager": MusicgenMelodyAttention, + "sdpa": MusicgenMelodySdpaAttention, + "flash_attention_2": MusicgenMelodyFlashAttention2, +} + + class MusicgenMelodyDecoderLayer(nn.Module): def __init__(self, config: MusicgenMelodyDecoderConfig): super().__init__() self.embed_dim = config.hidden_size - self.self_attn = MusicgenMelodyAttention( + self.self_attn = MUSICGEN_MELODY_ATTENTION_CLASSES[config._attn_implementation]( embed_dim=self.embed_dim, num_heads=config.num_attention_heads, dropout=config.attention_dropout, is_decoder=True, bias=False, + is_causal=True, + config=config, ) self.dropout = config.dropout self.activation_fn = ACT2FN[config.activation_function] @@ -414,6 +762,8 @@ class MusicgenMelodyPreTrainedModel(PreTrainedModel): base_model_prefix = "model" supports_gradient_checkpointing = True _no_split_modules = ["MusicgenMelodyDecoderLayer", "MusicgenMelodyAttention"] + _supports_flash_attn_2 = True + _supports_sdpa = True def _init_weights(self, module): std = self.config.initializer_factor @@ -626,6 +976,7 @@ class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel): self.layers = nn.ModuleList([MusicgenMelodyDecoderLayer(config) for _ in range(config.num_hidden_layers)]) self.layer_norm = nn.LayerNorm(config.hidden_size) + self.attn_implementation = config._attn_implementation self.gradient_checkpointing = False # Initialize weights and apply final processing @@ -695,9 +1046,21 @@ class MusicgenMelodyDecoder(MusicgenMelodyPreTrainedModel): input_shape = inputs_embeds.size()[:-1] - attention_mask = _prepare_4d_causal_attention_mask( - attention_mask, input_shape, inputs_embeds, past_key_values_length - ) + if self.attn_implementation == "flash_attention_2": + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self.attn_implementation == "sdpa" and not output_attentions: + # output_attentions=True can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. 
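+            # (the helper can also return None when the mask is redundant, in which case the
+            # attention layers below fall back to SDPA's fused `is_causal` path)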
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + inputs_embeds, + past_key_values_length, + ) + else: + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) # embed positions positions = self.embed_positions(inputs_embeds, past_key_values_length) @@ -1373,6 +1736,8 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel): config_class = MusicgenMelodyConfig main_input_name = "input_ids" supports_gradient_checkpointing = True + _supports_flash_attn_2 = True + _supports_sdpa = True def __init__( self, diff --git a/tests/models/musicgen/test_modeling_musicgen.py b/tests/models/musicgen/test_modeling_musicgen.py index adc3bf234e..df1df64c9c 100644 --- a/tests/models/musicgen/test_modeling_musicgen.py +++ b/tests/models/musicgen/test_modeling_musicgen.py @@ -16,9 +16,12 @@ import copy import inspect import math +import tempfile import unittest import numpy as np +from parameterized import parameterized +from pytest import mark from transformers import ( EncodecConfig, @@ -30,12 +33,15 @@ from transformers import ( ) from transformers.testing_utils import ( is_torch_available, + require_flash_attn, require_torch, require_torch_fp16, + require_torch_gpu, + require_torch_sdpa, slow, torch_device, ) -from transformers.utils import cached_property +from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -277,6 +283,615 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste self.assertNotIn(config.pad_token_id, output_generate) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, 1:] = 1 + dummy_attention_mask[:, :1] = 0 + + # Ignore copy + outputs = model(dummy_input, output_hidden_states=True) + # Ignore copy + outputs_fa = model_fa(dummy_input, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + + # Ignore copy + other_inputs = { + "output_hidden_states": 
True, + } + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + + outputs = model(dummy_input, **other_inputs) + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2) + + # check with inference + dropout + model.train() + _ = model_fa(dummy_input, **other_inputs) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding + def test_flash_attn_2_inference_equivalence_right_padding(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + if model.config.is_encoder_decoder: + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input) + + outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + else: + outputs = model(dummy_input, output_hidden_states=True) + outputs_fa = model_fa(dummy_input, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + # Ignore copy + other_inputs = { + "output_hidden_states": True, + } + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + + outputs = model(dummy_input, **other_inputs) + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding + def test_flash_attn_2_generate_left_padding(self): + # Ignore copy + for model_class in 
self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + # make sure we do left padding + dummy_attention_mask[:, :-1] = 0 + dummy_attention_mask[:, -1:] = 1 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right + def test_flash_attn_2_generate_padding_right(self): + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + # make sure we do right padding + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache + def test_flash_attn_2_generate_use_cache(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # 
make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + # Just test that a large cache works as expected + _ = model.generate( + dummy_input, + attention_mask=dummy_attention_mask, + max_new_tokens=max_new_tokens, + do_sample=False, + use_cache=True, + ) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.all_model_classes[0]._supports_sdpa: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): + self.skipTest( + f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" + ) + + # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead. + if torch_dtype == "float16": + torch_dtype = torch.float16 + elif torch_dtype == "bfloat16": + torch_dtype = torch.bfloat16 + elif torch_dtype == "float32": + torch_dtype = torch.float32 + + atols = { + ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-6, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-6, + ("cuda", True, torch.bfloat16): 1e-2, + ("cuda", True, torch.float16): 5e-3, + } + rtols = { + ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-4, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-4, + ("cuda", True, torch.bfloat16): 3e-2, + ("cuda", True, torch.float16): 5e-3, + } + + def get_mean_reldiff(failcase, x, ref, atol, rtol): + return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + is_encoder_decoder = model.config.is_encoder_decoder + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + model_sdpa = model_sdpa.eval().to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch_dtype, + attn_implementation="eager", + ) + model_eager = 
model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, + # but it would be nicer to have an efficient way to use parameterized.expand + fail_cases = [] + for padding_side in ["left", "right"]: + for use_mask in [False, True]: + for batch_size in [1, 5]: + # Ignore copy + batch_size_input_ids = self.model_tester.num_codebooks * batch_size + dummy_input = inputs_dict[model.main_input_name] + + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + dummy_input = dummy_input.to(torch_dtype) + + # Ignore copy + dummy_input = dummy_input[:batch_size_input_ids] + # Ignore copy + if dummy_input.shape[0] != batch_size_input_ids: + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + # Ignore copy + extension = torch.rand( + batch_size_input_ids - dummy_input.shape[0], + *dummy_input.shape[1:], + dtype=torch_dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + else: + # Ignore copy + extension = torch.randint( + high=5, + size=(batch_size_input_ids - dummy_input.shape[0], *dummy_input.shape[1:]), + dtype=dummy_input.dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + + if not use_mask: + dummy_attention_mask = None + else: + dummy_attention_mask = inputs_dict.get("attention_mask", None) + if dummy_attention_mask is None: + if is_encoder_decoder: + seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1] + else: + seqlen = dummy_input.shape[-1] + dummy_attention_mask = ( + torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device) + ) + + dummy_attention_mask = dummy_attention_mask[:batch_size] + if dummy_attention_mask.shape[0] != batch_size: + extension = torch.ones( + batch_size - dummy_attention_mask.shape[0], + *dummy_attention_mask.shape[1:], + dtype=dummy_attention_mask.dtype, + device=torch_device, + ) + dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0) + dummy_attention_mask = dummy_attention_mask.to(torch_device) + + dummy_attention_mask[:] = 1 + if padding_side == "left": + dummy_attention_mask[-1, :-1] = 1 + dummy_attention_mask[-1, -4:] = 0 + elif padding_side == "right": + dummy_attention_mask[-1, 1:] = 1 + dummy_attention_mask[-1, :3] = 0 + + for enable_kernels in [False, True]: + failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" + + other_inputs = { + "output_hidden_states": True, + } + + # Otherwise fails for e.g. WhisperEncoderModel + if "attention_mask" in inspect.signature(model_eager.forward).parameters: + other_inputs["attention_mask"] = dummy_attention_mask + + # TODO: test gradients as well (& for FA2 as well!) 
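+                                # `torch.backends.cuda.sdp_kernel` restricts which SDPA backends may be picked:
+                                # with `enable_kernels=False` only the math backend remains (closest to eager
+                                # numerics), while `enable_kernels=True` also allows the flash / memory-efficient
+                                # kernels, which is why the tolerance tables above key on this flag.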
+ with torch.no_grad(): + with torch.backends.cuda.sdp_kernel( + enable_flash=enable_kernels, + enable_math=True, + enable_mem_efficient=enable_kernels, + ): + outputs_eager = model_eager(dummy_input, **other_inputs) + outputs_sdpa = model_sdpa(dummy_input, **other_inputs) + + logits_eager = ( + outputs_eager.hidden_states[-1] + if not is_encoder_decoder + else outputs_eager.decoder_hidden_states[-1] + ) + logits_sdpa = ( + outputs_sdpa.hidden_states[-1] + if not is_encoder_decoder + else outputs_sdpa.decoder_hidden_states[-1] + ) + + if torch_device in ["cpu", "cuda"]: + atol = atols[torch_device, enable_kernels, torch_dtype] + rtol = rtols[torch_device, enable_kernels, torch_dtype] + else: + atol = 1e-7 + rtol = 1e-4 + + # Masked tokens output slightly deviates - we don't mind that. + if use_mask: + if padding_side == "left": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, :-4] + sub_eager = logits_eager[-1, :-4] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, -4:] + # sub_eager = logits_eager[-1, -4:] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + elif padding_side == "right": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, 3:] + sub_eager = logits_eager[-1, 3:] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, :3] + # sub_eager = logits_eager[-1, :3] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + + else: + if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) + + self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) + + @require_torch_sdpa + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = 
inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + res_sdpa = model_sdpa.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + self.assertTrue(torch.allclose(res_eager, res_sdpa)) + def prepare_musicgen_inputs_dict( config, @@ -941,6 +1556,639 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, self.assertNotIn(config.pad_token_id, output_generate) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, 1:] = 1 + dummy_attention_mask[:, :1] = 0 + + # Ignore copy + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input) + # Ignore copy + outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + # Ignore copy + outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + # Ignore copy + other_inputs = { + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": dummy_attention_mask, + 
"output_hidden_states": True, + } + # Ignore copy + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + # Ignore copy + outputs = model(dummy_input, **other_inputs) + # Ignore copy + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2) + + # check with inference + dropout + model.train() + _ = model_fa(dummy_input, **other_inputs) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding + def test_flash_attn_2_inference_equivalence_right_padding(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + # Ignore copy + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input) + # Ignore copy + outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + # Ignore copy + outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + + # Ignore copy + other_inputs = { + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": dummy_attention_mask, + "output_hidden_states": True, + } + # Ignore copy + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + # Ignore copy + outputs = model(dummy_input, **other_inputs) + # Ignore copy + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding + def 
test_flash_attn_2_generate_left_padding(self): + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = inputs_dict.get("attention_mask") + if dummy_attention_mask is None: + dummy_attention_mask = torch.ones_like(dummy_input) + + # make sure we do left padding + dummy_attention_mask[:, :-1] = 0 + dummy_attention_mask[:, -1:] = 1 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right + def test_flash_attn_2_generate_padding_right(self): + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = inputs_dict.get("attention_mask") + if dummy_attention_mask is None: + dummy_attention_mask = torch.ones_like(dummy_input) + # make sure we do right padding + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache + def test_flash_attn_2_generate_use_cache(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = 
self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + # Just test that a large cache works as expected + _ = model.generate( + dummy_input, + attention_mask=dummy_attention_mask, + max_new_tokens=max_new_tokens, + do_sample=False, + use_cache=True, + ) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.all_model_classes[0]._supports_sdpa: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): + self.skipTest( + f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" + ) + + # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead. 
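+        # Map the parameterized dtype string onto the actual torch dtype.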
+ if torch_dtype == "float16": + torch_dtype = torch.float16 + elif torch_dtype == "bfloat16": + torch_dtype = torch.bfloat16 + elif torch_dtype == "float32": + torch_dtype = torch.float32 + + atols = { + ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-6, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-6, + ("cuda", True, torch.bfloat16): 1e-2, + ("cuda", True, torch.float16): 5e-3, + } + rtols = { + ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-4, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-4, + ("cuda", True, torch.bfloat16): 3e-2, + ("cuda", True, torch.float16): 5e-3, + } + + def get_mean_reldiff(failcase, x, ref, atol, rtol): + return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + is_encoder_decoder = model.config.is_encoder_decoder + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + model_sdpa = model_sdpa.eval().to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch_dtype, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, + # but it would be nicer to have an efficient way to use parameterized.expand + fail_cases = [] + for padding_side in ["left", "right"]: + for use_mask in [False, True]: + for batch_size in [1, 5]: + dummy_input = inputs_dict[model.main_input_name] + + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + dummy_input = dummy_input.to(torch_dtype) + + dummy_input = dummy_input[:batch_size] + if dummy_input.shape[0] != batch_size: + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + extension = torch.rand( + batch_size - dummy_input.shape[0], + *dummy_input.shape[1:], + dtype=torch_dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + else: + extension = torch.randint( + high=5, + size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]), + dtype=dummy_input.dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), 
dim=0).to(torch_device) + + if not use_mask: + dummy_attention_mask = None + else: + dummy_attention_mask = inputs_dict.get("attention_mask", None) + if dummy_attention_mask is None: + # Ignore copy + seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1] + # Ignore copy + dummy_attention_mask = ( + torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device) + ) + + dummy_attention_mask = dummy_attention_mask[:batch_size] + if dummy_attention_mask.shape[0] != batch_size: + extension = torch.ones( + batch_size - dummy_attention_mask.shape[0], + *dummy_attention_mask.shape[1:], + dtype=dummy_attention_mask.dtype, + device=torch_device, + ) + dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0) + dummy_attention_mask = dummy_attention_mask.to(torch_device) + + dummy_attention_mask[:] = 1 + if padding_side == "left": + dummy_attention_mask[-1, :-1] = 1 + dummy_attention_mask[-1, -4:] = 0 + elif padding_side == "right": + dummy_attention_mask[-1, 1:] = 1 + dummy_attention_mask[-1, :3] = 0 + + for enable_kernels in [False, True]: + failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" + # Ignore copy + batch_size_input_ids = self.model_tester.num_codebooks * batch_size + # Ignore copy + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[ + :batch_size_input_ids + ] + # Ignore copy + if decoder_input_ids.shape[0] != batch_size_input_ids: + # Ignore copy + extension = torch.ones( + batch_size_input_ids - decoder_input_ids.shape[0], + *decoder_input_ids.shape[1:], + dtype=decoder_input_ids.dtype, + device=torch_device, + ) + decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0) + decoder_input_ids = decoder_input_ids.to(torch_device) + + # TODO: never an `attention_mask` arg here? + # Ignore copy + other_inputs = { + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": dummy_attention_mask, + "output_hidden_states": True, + } + + # TODO: test gradients as well (& for FA2 as well!) + # Ignore copy + with torch.no_grad(): + with torch.backends.cuda.sdp_kernel( + enable_flash=enable_kernels, + enable_math=True, + enable_mem_efficient=enable_kernels, + ): + outputs_eager = model_eager(dummy_input, **other_inputs) + outputs_sdpa = model_sdpa(dummy_input, **other_inputs) + + logits_eager = ( + outputs_eager.hidden_states[-1] + if not is_encoder_decoder + else outputs_eager.decoder_hidden_states[-1] + ) + logits_sdpa = ( + outputs_sdpa.hidden_states[-1] + if not is_encoder_decoder + else outputs_sdpa.decoder_hidden_states[-1] + ) + + if torch_device in ["cpu", "cuda"]: + atol = atols[torch_device, enable_kernels, torch_dtype] + rtol = rtols[torch_device, enable_kernels, torch_dtype] + else: + atol = 1e-7 + rtol = 1e-4 + + # Masked tokens output slightly deviates - we don't mind that. 
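+                                # The slices below therefore exclude the padded positions of the last batch row,
+                                # comparing only tokens that are actually attended to.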
+ if use_mask: + if padding_side == "left": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, :-4] + sub_eager = logits_eager[-1, :-4] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, -4:] + # sub_eager = logits_eager[-1, -4:] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + elif padding_side == "right": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, 3:] + sub_eager = logits_eager[-1, 3:] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, :3] + # sub_eager = logits_eager[-1, :3] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + + else: + if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) + + self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) + + @require_torch_sdpa + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in 
submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + res_sdpa = model_sdpa.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + self.assertTrue(torch.allclose(res_eager, res_sdpa)) + def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000): """Produces a series of 'bip bip' sounds at a given frequency.""" diff --git a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py index 7bb346d8ab..667958a251 100644 --- a/tests/models/musicgen_melody/test_modeling_musicgen_melody.py +++ b/tests/models/musicgen_melody/test_modeling_musicgen_melody.py @@ -16,9 +16,12 @@ import copy import inspect import math +import tempfile import unittest import numpy as np +from parameterized import parameterized +from pytest import mark from transformers import ( EncodecConfig, @@ -30,13 +33,16 @@ from transformers import ( from transformers.testing_utils import ( is_torch_available, is_torchaudio_available, + require_flash_attn, require_torch, require_torch_fp16, + require_torch_gpu, + require_torch_sdpa, require_torchaudio, slow, torch_device, ) -from transformers.utils import cached_property +from transformers.utils import cached_property, is_torch_bf16_available_on_device, is_torch_fp16_available_on_device from ...generation.test_utils import GenerationTesterMixin from ...test_configuration_common import ConfigTester @@ -277,6 +283,615 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes self.assertIsInstance(output_generate, GenerateDecoderOnlyOutput) self.assertNotIn(config.pad_token_id, output_generate) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_flash_attn_2_inference_equivalence + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, 1:] = 1 + dummy_attention_mask[:, :1] = 0 + + # Ignore copy + outputs = model(dummy_input, output_hidden_states=True) + # Ignore copy + outputs_fa = model_fa(dummy_input, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + 
outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + + # Ignore copy + other_inputs = { + "output_hidden_states": True, + } + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + + outputs = model(dummy_input, **other_inputs) + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2) + + # check with inference + dropout + model.train() + _ = model_fa(dummy_input, **other_inputs) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_flash_attn_2_inference_equivalence_right_padding + def test_flash_attn_2_inference_equivalence_right_padding(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + if model.config.is_encoder_decoder: + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input) + + outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + else: + outputs = model(dummy_input, output_hidden_states=True) + outputs_fa = model_fa(dummy_input, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + # Ignore copy + other_inputs = { + "output_hidden_states": True, + } + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + + outputs = model(dummy_input, **other_inputs) + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) + 
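
The equivalence pattern these tests exercise can be reproduced outside the harness. A minimal sketch with a tiny randomly initialised decoder, so it needs no checkpoint or GPU (the config values here are arbitrary illustrative choices, and the SDPA path needs a recent enough torch):

import tempfile

import torch

from transformers import MusicgenMelodyDecoderConfig, MusicgenMelodyForCausalLM

# Tiny random decoder; all config values are arbitrary illustrative choices.
config = MusicgenMelodyDecoderConfig(
    vocab_size=64,
    hidden_size=32,
    num_hidden_layers=2,
    num_attention_heads=4,
    ffn_dim=64,
    num_codebooks=4,
)
model = MusicgenMelodyForCausalLM(config)

# Mirror the tests: save once, then reload with each attention implementation,
# since `attn_implementation` is resolved at load time.
with tempfile.TemporaryDirectory() as tmp:
    model.save_pretrained(tmp)
    model_eager = MusicgenMelodyForCausalLM.from_pretrained(tmp, attn_implementation="eager").eval()
    model_sdpa = MusicgenMelodyForCausalLM.from_pretrained(tmp, attn_implementation="sdpa").eval()

# Musicgen decoders flatten codebooks into the batch: (batch_size * num_codebooks, seq_len).
input_ids = torch.randint(0, config.vocab_size, (2 * config.num_codebooks, 8))

with torch.no_grad():
    logits_eager = model_eager(input_ids).logits
    logits_sdpa = model_sdpa(input_ids).logits

# In fp32 the two paths should agree to tight tolerances.
assert torch.allclose(logits_eager, logits_sdpa, atol=1e-5, rtol=1e-4)
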
+ @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding + def test_flash_attn_2_generate_left_padding(self): + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + # make sure we do left padding + dummy_attention_mask[:, :-1] = 0 + dummy_attention_mask[:, -1:] = 1 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right + def test_flash_attn_2_generate_padding_right(self): + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + # make sure we do right padding + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_flash_attn_2_generate_use_cache + def test_flash_attn_2_generate_use_cache(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does 
not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + # Just test that a large cache works as expected + _ = model.generate( + dummy_input, + attention_mask=dummy_attention_mask, + max_new_tokens=max_new_tokens, + do_sample=False, + use_cache=True, + ) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_eager_matches_sdpa_inference + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.all_model_classes[0]._supports_sdpa: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): + self.skipTest( + f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" + ) + + # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead. 
+ if torch_dtype == "float16": + torch_dtype = torch.float16 + elif torch_dtype == "bfloat16": + torch_dtype = torch.bfloat16 + elif torch_dtype == "float32": + torch_dtype = torch.float32 + + atols = { + ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-6, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-6, + ("cuda", True, torch.bfloat16): 1e-2, + ("cuda", True, torch.float16): 5e-3, + } + rtols = { + ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-4, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-4, + ("cuda", True, torch.bfloat16): 3e-2, + ("cuda", True, torch.float16): 5e-3, + } + + def get_mean_reldiff(failcase, x, ref, atol, rtol): + return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + is_encoder_decoder = model.config.is_encoder_decoder + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + model_sdpa = model_sdpa.eval().to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch_dtype, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise ValueError("The SDPA model should have SDPA attention layers") + + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, + # but it would be nicer to have an efficient way to use parameterized.expand + fail_cases = [] + for padding_side in ["left", "right"]: + for use_mask in [False, True]: + for batch_size in [1, 5]: + # Ignore copy + batch_size_input_ids = self.model_tester.num_codebooks * batch_size + dummy_input = inputs_dict[model.main_input_name] + + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + dummy_input = dummy_input.to(torch_dtype) + + # Ignore copy + dummy_input = dummy_input[:batch_size_input_ids] + # Ignore copy + if dummy_input.shape[0] != batch_size_input_ids: + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + # Ignore copy + extension = torch.rand( + batch_size_input_ids - dummy_input.shape[0], + *dummy_input.shape[1:], + dtype=torch_dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + else: + # Ignore copy + extension = torch.randint( + high=5, + 
size=(batch_size_input_ids - dummy_input.shape[0], *dummy_input.shape[1:]), + dtype=dummy_input.dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + + if not use_mask: + dummy_attention_mask = None + else: + dummy_attention_mask = inputs_dict.get("attention_mask", None) + if dummy_attention_mask is None: + if is_encoder_decoder: + seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1] + else: + seqlen = dummy_input.shape[-1] + dummy_attention_mask = ( + torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device) + ) + + dummy_attention_mask = dummy_attention_mask[:batch_size] + if dummy_attention_mask.shape[0] != batch_size: + extension = torch.ones( + batch_size - dummy_attention_mask.shape[0], + *dummy_attention_mask.shape[1:], + dtype=dummy_attention_mask.dtype, + device=torch_device, + ) + dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0) + dummy_attention_mask = dummy_attention_mask.to(torch_device) + + dummy_attention_mask[:] = 1 + if padding_side == "left": + dummy_attention_mask[-1, :-1] = 1 + dummy_attention_mask[-1, -4:] = 0 + elif padding_side == "right": + dummy_attention_mask[-1, 1:] = 1 + dummy_attention_mask[-1, :3] = 0 + + for enable_kernels in [False, True]: + failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" + + other_inputs = { + "output_hidden_states": True, + } + + # Otherwise fails for e.g. WhisperEncoderModel + if "attention_mask" in inspect.signature(model_eager.forward).parameters: + other_inputs["attention_mask"] = dummy_attention_mask + + # TODO: test gradients as well (& for FA2 as well!) + with torch.no_grad(): + with torch.backends.cuda.sdp_kernel( + enable_flash=enable_kernels, + enable_math=True, + enable_mem_efficient=enable_kernels, + ): + outputs_eager = model_eager(dummy_input, **other_inputs) + outputs_sdpa = model_sdpa(dummy_input, **other_inputs) + + logits_eager = ( + outputs_eager.hidden_states[-1] + if not is_encoder_decoder + else outputs_eager.decoder_hidden_states[-1] + ) + logits_sdpa = ( + outputs_sdpa.hidden_states[-1] + if not is_encoder_decoder + else outputs_sdpa.decoder_hidden_states[-1] + ) + + if torch_device in ["cpu", "cuda"]: + atol = atols[torch_device, enable_kernels, torch_dtype] + rtol = rtols[torch_device, enable_kernels, torch_dtype] + else: + atol = 1e-7 + rtol = 1e-4 + + # Masked tokens output slightly deviates - we don't mind that. 
+ if use_mask: + if padding_side == "left": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, :-4] + sub_eager = logits_eager[-1, :-4] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, -4:] + # sub_eager = logits_eager[-1, -4:] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + elif padding_side == "right": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, 3:] + sub_eager = logits_eager[-1, 3:] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, :3] + # sub_eager = logits_eager[-1, :3] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + + else: + if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) + + self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) + + @require_torch_sdpa + @slow + # Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_eager_matches_sdpa_generate + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if 
"SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + res_sdpa = model_sdpa.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + self.assertTrue(torch.allclose(res_eager, res_sdpa)) + def prepare_musicgen_melody_inputs_dict( config, @@ -923,6 +1538,639 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester self.assertNotIn(config.pad_token_id, output_generate) + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence + def test_flash_attn_2_inference_equivalence(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, 1:] = 1 + dummy_attention_mask[:, :1] = 0 + + # Ignore copy + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input) + # Ignore copy + outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + # Ignore copy + outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + # Ignore copy + other_inputs = { + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": dummy_attention_mask, + "output_hidden_states": True, + } + # Ignore copy + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + # Ignore copy + outputs = model(dummy_input, **other_inputs) + # Ignore copy + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[1:], logits[1:], atol=4e-2, rtol=4e-2) + + # check with inference + dropout + model.train() + _ = model_fa(dummy_input, **other_inputs) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + 
@slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding + def test_flash_attn_2_inference_equivalence_right_padding(self): + for model_class in self.all_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_fa = model_class.from_pretrained( + tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2" + ) + model_fa.to(torch_device) + + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.bfloat16) + model.to(torch_device) + + # Ignore copy + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.float16]: + dummy_input = dummy_input.to(torch.bfloat16) + + dummy_attention_mask = inputs_dict.get("attention_mask", None) + + if dummy_attention_mask is not None: + # Ignore copy + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + # Ignore copy + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input) + # Ignore copy + outputs = model(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + # Ignore copy + outputs_fa = model_fa(dummy_input, decoder_input_ids=decoder_input_ids, output_hidden_states=True) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa, logits, atol=4e-2, rtol=4e-2) + + # Ignore copy + other_inputs = { + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": dummy_attention_mask, + "output_hidden_states": True, + } + # Ignore copy + if dummy_attention_mask is not None: + other_inputs["attention_mask"] = dummy_attention_mask + # Ignore copy + outputs = model(dummy_input, **other_inputs) + # Ignore copy + outputs_fa = model_fa(dummy_input, **other_inputs) + + logits = ( + outputs.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs.decoder_hidden_states[-1] + ) + logits_fa = ( + outputs_fa.hidden_states[-1] + if not model.config.is_encoder_decoder + else outputs_fa.decoder_hidden_states[-1] + ) + + assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding + def test_flash_attn_2_generate_left_padding(self): + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = 
inputs_dict.get("attention_mask") + if dummy_attention_mask is None: + dummy_attention_mask = torch.ones_like(dummy_input) + + # make sure we do left padding + dummy_attention_mask[:, :-1] = 0 + dummy_attention_mask[:, -1:] = 1 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right + def test_flash_attn_2_generate_padding_right(self): + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( + torch_device + ) + + dummy_input = inputs_dict[model.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + dummy_attention_mask = inputs_dict.get("attention_mask") + if dummy_attention_mask is None: + dummy_attention_mask = torch.ones_like(dummy_input) + # make sure we do right padding + dummy_attention_mask[:, :-1] = 1 + dummy_attention_mask[:, -1:] = 0 + + out = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + low_cpu_mem_usage=True, + ).to(torch_device) + + out_fa = model.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False + ) + + self.assertTrue(torch.allclose(out, out_fa)) + + @require_flash_attn + @require_torch_gpu + @mark.flash_attn_test + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache + def test_flash_attn_2_generate_use_cache(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_flash_attn_2: + self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + attn_implementation="flash_attention_2", + 
low_cpu_mem_usage=True, + ).to(torch_device) + + # Just test that a large cache works as expected + _ = model.generate( + dummy_input, + attention_mask=dummy_attention_mask, + max_new_tokens=max_new_tokens, + do_sample=False, + use_cache=True, + ) + + @parameterized.expand([("float16",), ("bfloat16",), ("float32",)]) + @require_torch_sdpa + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference + def test_eager_matches_sdpa_inference(self, torch_dtype: str): + if not self.all_model_classes[0]._supports_sdpa: + self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA") + + if torch_dtype == "float16" and not is_torch_fp16_available_on_device(torch_device): + self.skipTest(f"float16 not supported on {torch_device} (on the specific device currently used)") + + if torch_dtype == "bfloat16" and not is_torch_bf16_available_on_device(torch_device): + self.skipTest( + f"bfloat16 not supported on {torch_device} (on the specific device currently used, e.g. Nvidia T4 GPU)" + ) + + # Not sure whether it's fine to put torch.XXX in a decorator if torch is not available so hacking it here instead. + if torch_dtype == "float16": + torch_dtype = torch.float16 + elif torch_dtype == "bfloat16": + torch_dtype = torch.bfloat16 + elif torch_dtype == "float32": + torch_dtype = torch.float32 + + atols = { + ("cpu", False, torch.float32): 1e-6, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-6, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-6, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-6, + ("cuda", True, torch.bfloat16): 1e-2, + ("cuda", True, torch.float16): 5e-3, + } + rtols = { + ("cpu", False, torch.float32): 1e-4, + ("cpu", False, torch.bfloat16): 1e-2, + ("cpu", True, torch.float32): 1e-4, + ("cpu", True, torch.bfloat16): 1e-2, + ("cuda", False, torch.float32): 1e-4, + ("cuda", False, torch.bfloat16): 1e-2, + ("cuda", False, torch.float16): 5e-3, + ("cuda", True, torch.float32): 1e-4, + ("cuda", True, torch.bfloat16): 3e-2, + ("cuda", True, torch.float16): 5e-3, + } + + def get_mean_reldiff(failcase, x, ref, atol, rtol): + return f"{failcase}: mean relative difference: {((x - ref).abs() / (ref.abs() + 1e-12)).mean():.3e}, torch atol = {atol}, torch rtol = {rtol}" + + for model_class in self.all_model_classes: + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + model = model_class(config) + + is_encoder_decoder = model.config.is_encoder_decoder + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype) + model_sdpa = model_sdpa.eval().to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch_dtype, + attn_implementation="eager", + ) + model_eager = model_eager.eval().to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa and model_sdpa.config.model_type != "falcon": + raise 
ValueError("The SDPA model should have SDPA attention layers") + + # We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model, + # but it would be nicer to have an efficient way to use parameterized.expand + fail_cases = [] + for padding_side in ["left", "right"]: + for use_mask in [False, True]: + for batch_size in [1, 5]: + dummy_input = inputs_dict[model.main_input_name] + + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + dummy_input = dummy_input.to(torch_dtype) + + dummy_input = dummy_input[:batch_size] + if dummy_input.shape[0] != batch_size: + if dummy_input.dtype in [torch.float32, torch.bfloat16, torch.float16]: + extension = torch.rand( + batch_size - dummy_input.shape[0], + *dummy_input.shape[1:], + dtype=torch_dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + else: + extension = torch.randint( + high=5, + size=(batch_size - dummy_input.shape[0], *dummy_input.shape[1:]), + dtype=dummy_input.dtype, + device=torch_device, + ) + dummy_input = torch.cat((dummy_input, extension), dim=0).to(torch_device) + + if not use_mask: + dummy_attention_mask = None + else: + dummy_attention_mask = inputs_dict.get("attention_mask", None) + if dummy_attention_mask is None: + # Ignore copy + seqlen = inputs_dict.get("decoder_input_ids", dummy_input).shape[-1] + # Ignore copy + dummy_attention_mask = ( + torch.ones(batch_size, seqlen).to(torch.int64).to(torch_device) + ) + + dummy_attention_mask = dummy_attention_mask[:batch_size] + if dummy_attention_mask.shape[0] != batch_size: + extension = torch.ones( + batch_size - dummy_attention_mask.shape[0], + *dummy_attention_mask.shape[1:], + dtype=dummy_attention_mask.dtype, + device=torch_device, + ) + dummy_attention_mask = torch.cat((dummy_attention_mask, extension), dim=0) + dummy_attention_mask = dummy_attention_mask.to(torch_device) + + dummy_attention_mask[:] = 1 + if padding_side == "left": + dummy_attention_mask[-1, :-1] = 1 + dummy_attention_mask[-1, -4:] = 0 + elif padding_side == "right": + dummy_attention_mask[-1, 1:] = 1 + dummy_attention_mask[-1, :3] = 0 + + for enable_kernels in [False, True]: + failcase = f"padding_side={padding_side}, use_mask={use_mask}, batch_size={batch_size}, enable_kernels={enable_kernels}" + # Ignore copy + batch_size_input_ids = self.model_tester.num_codebooks * batch_size + # Ignore copy + decoder_input_ids = inputs_dict.get("decoder_input_ids", dummy_input)[ + :batch_size_input_ids + ] + # Ignore copy + if decoder_input_ids.shape[0] != batch_size_input_ids: + # Ignore copy + extension = torch.ones( + batch_size_input_ids - decoder_input_ids.shape[0], + *decoder_input_ids.shape[1:], + dtype=decoder_input_ids.dtype, + device=torch_device, + ) + decoder_input_ids = torch.cat((decoder_input_ids, extension), dim=0) + decoder_input_ids = decoder_input_ids.to(torch_device) + + # TODO: never an `attention_mask` arg here? + # Ignore copy + other_inputs = { + "decoder_input_ids": decoder_input_ids, + "decoder_attention_mask": dummy_attention_mask, + "output_hidden_states": True, + } + + # TODO: test gradients as well (& for FA2 as well!) 
+ # Ignore copy + with torch.no_grad(): + with torch.backends.cuda.sdp_kernel( + enable_flash=enable_kernels, + enable_math=True, + enable_mem_efficient=enable_kernels, + ): + outputs_eager = model_eager(dummy_input, **other_inputs) + outputs_sdpa = model_sdpa(dummy_input, **other_inputs) + + logits_eager = ( + outputs_eager.hidden_states[-1] + if not is_encoder_decoder + else outputs_eager.decoder_hidden_states[-1] + ) + logits_sdpa = ( + outputs_sdpa.hidden_states[-1] + if not is_encoder_decoder + else outputs_sdpa.decoder_hidden_states[-1] + ) + + if torch_device in ["cpu", "cuda"]: + atol = atols[torch_device, enable_kernels, torch_dtype] + rtol = rtols[torch_device, enable_kernels, torch_dtype] + else: + atol = 1e-7 + rtol = 1e-4 + + # Masked tokens output slightly deviates - we don't mind that. + if use_mask: + if padding_side == "left": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, :-4] + sub_eager = logits_eager[-1, :-4] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, -4:] + # sub_eager = logits_eager[-1, -4:] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + elif padding_side == "right": + sub_sdpa = logits_sdpa[:-1] + sub_eager = logits_eager[:-1] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + sub_sdpa = logits_sdpa[-1, 3:] + sub_eager = logits_eager[-1, 3:] + if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, sub_sdpa, sub_eager, atol, rtol) + ) + + # Testing the padding tokens is not really meaningful but anyway + # sub_sdpa = logits_sdpa[-1, :3] + # sub_eager = logits_eager[-1, :3] + # if not torch.allclose(sub_sdpa, sub_eager, atol=atol, rtol=rtol): + # fail_cases.append(get_mean_reldiff(failcase, sub_sdpa, sub_eager, 4e-2, 4e-2)) + + else: + if not torch.allclose(logits_sdpa, logits_eager, atol=atol, rtol=rtol): + fail_cases.append( + get_mean_reldiff(failcase, logits_sdpa, logits_eager, atol, rtol) + ) + + self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases)) + + @require_torch_sdpa + @slow + # Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_generate + def test_eager_matches_sdpa_generate(self): + max_new_tokens = 30 + + # Ignore copy + for model_class in self.greedy_sample_model_classes: + if not model_class._supports_sdpa: + self.skipTest(f"{model_class.__name__} does not support SDPA") + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + dummy_input = inputs_dict[model_class.main_input_name] + if dummy_input.dtype in [torch.float32, torch.bfloat16]: + dummy_input = dummy_input.to(torch.float16) + + # make sure that all models have enough positions for generation + if hasattr(config, "max_position_embeddings"): + config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1 + + model = model_class(config) + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + + 
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + + model_sdpa = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + ).to(torch_device) + + self.assertTrue(model_sdpa.config._attn_implementation == "sdpa") + + model_eager = model_class.from_pretrained( + tmpdirname, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + attn_implementation="eager", + ).to(torch_device) + + self.assertTrue(model_eager.config._attn_implementation == "eager") + + for name, submodule in model_eager.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + raise ValueError("The eager model should not have SDPA attention layers") + + has_sdpa = False + for name, submodule in model_sdpa.named_modules(): + if "SdpaAttention" in submodule.__class__.__name__: + has_sdpa = True + break + if not has_sdpa: + raise ValueError("The SDPA model should have SDPA attention layers") + + # Just test that a large cache works as expected + res_eager = model_eager.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + res_sdpa = model_sdpa.generate( + dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False + ) + + self.assertTrue(torch.allclose(res_eager, res_sdpa)) + # Copied from tests.models.musicgen.test_modeling_musicgen.get_bip_bip def get_bip_bip(bip_duration=0.125, duration=0.5, sample_rate=32000): From cb5927ca8f4c922365cebf08ae66566e65443a52 Mon Sep 17 00:00:00 2001 From: Michael Date: Tue, 2 Apr 2024 19:37:56 +0800 Subject: [PATCH 004/110] [Docs] Make an ordered list prettier in add_tensorflow_model.md (#29949) --- docs/source/en/add_tensorflow_model.md | 62 +++++++++++++------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/docs/source/en/add_tensorflow_model.md b/docs/source/en/add_tensorflow_model.md index 52c7e3b1ad..23a1e2d170 100644 --- a/docs/source/en/add_tensorflow_model.md +++ b/docs/source/en/add_tensorflow_model.md @@ -109,52 +109,52 @@ instructions below to set up your environment and open a draft PR. 2. Clone your `transformers` fork to your local disk, and add the base repository as a remote: -```bash -git clone https://github.com/[your Github handle]/transformers.git -cd transformers -git remote add upstream https://github.com/huggingface/transformers.git -``` + ```bash + git clone https://github.com/[your Github handle]/transformers.git + cd transformers + git remote add upstream https://github.com/huggingface/transformers.git + ``` -3. Set up a development environment, for instance by running the following command: +3. Set up a development environment, for instance by running the following commands: -```bash -python -m venv .env -source .env/bin/activate -pip install -e ".[dev]" -``` + ```bash + python -m venv .env + source .env/bin/activate + pip install -e ".[dev]" + ``` -Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a -failure with this command. If that's the case make sure to install TensorFlow then do: + Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a + failure with this command. If that's the case make sure to install TensorFlow then do: -```bash -pip install -e ".[quality]" -``` + ```bash + pip install -e ".[quality]" + ``` -**Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. 
+ **Note:** You don't need to have CUDA installed. Making the new model work on CPU is sufficient. -4. Create a branch with a descriptive name from your main branch +4. Create a branch with a descriptive name from your main branch: -```bash -git checkout -b add_tf_brand_new_bert -``` + ```bash + git checkout -b add_tf_brand_new_bert + ``` -5. Fetch and rebase to current main +5. Fetch and rebase to current main: -```bash -git fetch upstream -git rebase upstream/main -``` + ```bash + git fetch upstream + git rebase upstream/main + ``` 6. Add an empty `.py` file in `transformers/src/models/brandnewbert/` named `modeling_tf_brandnewbert.py`. This will be your TensorFlow model file. 7. Push the changes to your account using: -```bash -git add . -git commit -m "initial commit" -git push -u origin add_tf_brand_new_bert -``` + ```bash + git add . + git commit -m "initial commit" + git push -u origin add_tf_brand_new_bert + ``` 8. Once you are satisfied, go to the webpage of your fork on GitHub. Click on “Pull request”. Make sure to add the GitHub handle of some members of the Hugging Face team as reviewers, so that the Hugging Face team gets notified for From 15cd68713d8d027e1033906bf39e999a24b5b5dd Mon Sep 17 00:00:00 2001 From: "Minsub Lee (Matt)" Date: Tue, 2 Apr 2024 23:55:11 +0900 Subject: [PATCH 005/110] Fix `skip_special_tokens` for `Wav2Vec2CTCTokenizer._decode` (#29311) * Fix skip_special_tokens process for Wav2Vec2CTCTokenizer._decode * Fix skip_special_tokens for Wav2Vec2CTCTokenizer._decode * Exclude pad_token filtering since it is used as CTC-blank token * Add small test for skip_special_tokens * Update decoding test for added new token --- .../models/wav2vec2/tokenization_wav2vec2.py | 9 ++++++--- tests/models/wav2vec2/test_tokenization_wav2vec2.py | 13 +++++++++---- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index 42b1aa3063..34848a841e 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -113,7 +113,6 @@ class Wav2Vec2CTCTokenizerOutput(ModelOutput): class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): - """ Constructs a Wav2Vec2CTC tokenizer. @@ -420,7 +419,9 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): result = [] for token in filtered_tokens: - if skip_special_tokens and token in self.all_special_ids: + if skip_special_tokens and ( + token in self.all_special_ids or (token != self.pad_token and token in self.all_special_tokens) + ): continue result.append(token) @@ -881,7 +882,9 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer): result = [] for token in filtered_tokens: - if skip_special_tokens and token in self.all_special_ids: + if skip_special_tokens and ( + token in self.all_special_ids or (token != self.pad_token and token in self.all_special_tokens) + ): continue result.append(token) diff --git a/tests/models/wav2vec2/test_tokenization_wav2vec2.py b/tests/models/wav2vec2/test_tokenization_wav2vec2.py index 05109f9736..6c98e0e0c8 100644 --- a/tests/models/wav2vec2/test_tokenization_wav2vec2.py +++ b/tests/models/wav2vec2/test_tokenization_wav2vec2.py @@ -13,6 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
"""Tests for the Wav2Vec2 tokenizer.""" + import inspect import json import os @@ -144,8 +145,10 @@ class Wav2Vec2TokenizerTest(unittest.TestCase): [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34], ] batch_tokens = tokenizer.batch_decode(sample_ids) + batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True) self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"]) + self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"]) def test_call(self): # Tests that all call wrap to encode_plus and batch_encode_plus @@ -452,18 +455,20 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase): def test_tokenizer_decode_added_tokens(self): tokenizer = self.tokenizer_class.from_pretrained("facebook/wav2vec2-base-960h") - tokenizer.add_tokens(["!", "?"]) + tokenizer.add_tokens(["!", "?", ""]) tokenizer.add_special_tokens({"cls_token": "$$$"}) # fmt: off sample_ids = [ - [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98, 32, 32, 33, tokenizer.word_delimiter_token_id, 32, 32, 33, 34, 34], - [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34], + [11, 5, 15, tokenizer.pad_token_id, 15, 8, 98, 32, 32, 33, tokenizer.word_delimiter_token_id, 32, 32, 33, 34, 34, 35, 35], + [24, 22, 5, tokenizer.word_delimiter_token_id, 24, 22, 5, 77, tokenizer.pad_token_id, 34, 34, 35, 35], ] # fmt: on batch_tokens = tokenizer.batch_decode(sample_ids) + batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True) - self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"]) + self.assertEqual(batch_tokens, ["HELLO!?!?$$$", "BYE BYE$$$"]) + self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"]) def test_special_characters_in_vocab(self): sent = "ʈʰ æ æ̃ ˧ kʰ" From 9b0a8ea7d1d6226b76cfdc645ce65e21157e2b50 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Tue, 2 Apr 2024 16:59:05 +0200 Subject: [PATCH 006/110] Hard error when ignoring tensors. (#27484) (#29906) * Hard error when ignoring tensors. (#27484) * [WIP] Hard error when ignoring tensors. * Better selection/error when saving a checkpoint. - Find all names we should normally drop (those are in the transformers config) - Find all disjoint tensors (for those we can safely trigger a copy to get rid of the sharing before saving) - Clone those disjoint tensors getting rid of the issue - Find all identical names (those should be declared in the config but we try to find them all anyway.) - For all identical names: - If they are in the config, just ignore them everything is fine - If they are not, warn about them. - For all remainder tensors which are shared yet neither identical NOR disjoint. raise a hard error. * Adding a failing test on `main` that passes here. * We don't need to keep the subfolder logic in this test. * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Add small tests. * Dead variable. * Fixup. * Fixing tied_Weights_keys on generic models. * Fixup + T5 encoder/decoder tying (with different layers) * Code quality. * Dynamic member. * trigger * Fixing encoder name for other types of encoder/decoder combos. * Fix scoping. * Update .github/workflows/self-scheduled.yml Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Fixing the tied_weights after the call. 
--------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: ydshieh --- src/transformers/modeling_utils.py | 155 +++++++++++++++--- src/transformers/models/bert/modeling_bert.py | 3 +- .../modeling_encoder_decoder.py | 11 +- .../models/marian/modeling_marian.py | 8 +- .../models/musicgen/modeling_musicgen.py | 11 +- .../modeling_musicgen_melody.py | 11 +- tests/test_modeling_utils.py | 59 ++++++- 7 files changed, 225 insertions(+), 33 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 19aab73478..fd0afa521a 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -30,7 +30,7 @@ from contextlib import contextmanager from dataclasses import dataclass from functools import partial, wraps from threading import Thread -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union from zipfile import is_zipfile import torch @@ -573,6 +573,79 @@ def set_initialized_submodules(model, state_dict_keys): return not_initialized_submodules +def _end_ptr(tensor: torch.Tensor) -> int: + # extract the end of the pointer if the tensor is a slice of a bigger tensor + if tensor.nelement(): + stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size() + else: + stop = tensor.data_ptr() + return stop + + +def _get_tied_weight_keys(module: nn.Module, prefix=""): + tied_weight_keys = [] + if getattr(module, "_tied_weights_keys", None) is not None: + names = [f"{prefix}.{k}" if prefix else k for k in module._tied_weights_keys] + tied_weight_keys.extend(names) + if getattr(module, "_dynamic_tied_weights_keys", None) is not None: + names = [f"{prefix}.{k}" if prefix else k for k in module._dynamic_tied_weights_keys] + tied_weight_keys.extend(names) + for name, submodule in module.named_children(): + local_prefix = f"{prefix}.{name}" if prefix else name + tied_weight_keys.extend(_get_tied_weight_keys(submodule, prefix=local_prefix)) + return tied_weight_keys + + +def _find_disjoint(tensors: List[Set[str]], state_dict: Dict[str, torch.Tensor]) -> Tuple[List[Set[str]], List[str]]: + filtered_tensors = [] + for shared in tensors: + if len(shared) < 2: + filtered_tensors.append(shared) + continue + + areas = [] + for name in shared: + tensor = state_dict[name] + areas.append((tensor.data_ptr(), _end_ptr(tensor), name)) + areas.sort() + + _, last_stop, last_name = areas[0] + filtered_tensors.append({last_name}) + for start, stop, name in areas[1:]: + if start >= last_stop: + filtered_tensors.append({name}) + else: + filtered_tensors[-1].add(name) + last_stop = stop + disjoint_tensors = [] + shared_tensors = [] + for tensors in filtered_tensors: + if len(tensors) == 1: + disjoint_tensors.append(tensors.pop()) + else: + shared_tensors.append(tensors) + return shared_tensors, disjoint_tensors + + +def _find_identical(tensors: List[Set[str]], state_dict: Dict[str, torch.Tensor]) -> Tuple[List[Set[str]], Set[str]]: + shared_tensors = [] + identical = [] + for shared in tensors: + if len(shared) < 2: + continue + + areas = collections.defaultdict(set) + for name in shared: + tensor = state_dict[name] + area = (tensor.device, tensor.data_ptr(), _end_ptr(tensor)) + areas[area].add(name) + if len(areas) == 1: + identical.append(shared) + else: + shared_tensors.append(shared) + return shared_tensors, identical + + def _load_state_dict_into_model(model_to_load, state_dict, start_prefix): # Convert old format to new format if 
needed from a PyTorch state_dict
     old_keys = []
@@ -1646,15 +1719,24 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         if getattr(self.config, "is_encoder_decoder", False) and getattr(self.config, "tie_encoder_decoder", False):
             if hasattr(self, self.base_model_prefix):
                 self = getattr(self, self.base_model_prefix)
-            self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix)
+            tied_weights = self._tie_encoder_decoder_weights(
+                self.encoder, self.decoder, self.base_model_prefix, "encoder"
+            )
+            # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class
+            # attribute, not an instance member; modifying it would modify the entire class,
+            # leading to issues on subsequent calls by different tests or subsequent calls.
+            self._dynamic_tied_weights_keys = tied_weights

         for module in self.modules():
             if hasattr(module, "_tie_weights"):
                 module._tie_weights()

     @staticmethod
-    def _tie_encoder_decoder_weights(encoder: nn.Module, decoder: nn.Module, base_model_prefix: str):
+    def _tie_encoder_decoder_weights(
+        encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, base_encoder_name: str
+    ):
         uninitialized_encoder_weights: List[str] = []
+        tied_weights: List[str] = []
         if decoder.__class__ != encoder.__class__:
             logger.info(
                 f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder"
@@ -1665,8 +1747,11 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
             decoder_pointer: nn.Module,
             encoder_pointer: nn.Module,
             module_name: str,
+            base_encoder_name: str,
             uninitialized_encoder_weights: List[str],
             depth=0,
+            total_decoder_name="",
+            total_encoder_name="",
         ):
             assert isinstance(decoder_pointer, nn.Module) and isinstance(
                 encoder_pointer, nn.Module
@@ -1674,8 +1759,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
             if hasattr(decoder_pointer, "weight"):
                 assert hasattr(encoder_pointer, "weight")
                 encoder_pointer.weight = decoder_pointer.weight
+                tied_weights.append(f"{base_encoder_name}{total_encoder_name}.weight")
                 if hasattr(decoder_pointer, "bias"):
                     assert hasattr(encoder_pointer, "bias")
+                    tied_weights.append(f"{base_encoder_name}{total_encoder_name}.bias")
                     encoder_pointer.bias = decoder_pointer.bias
                 return
@@ -1713,19 +1800,26 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
                     decoder_modules[decoder_name],
                     encoder_modules[encoder_name],
                     module_name + "/" + name,
+                    base_encoder_name,
                     uninitialized_encoder_weights,
                     depth=depth + 1,
+                    total_encoder_name=f"{total_encoder_name}.{encoder_name}",
+                    total_decoder_name=f"{total_decoder_name}.{decoder_name}",
                 )
                 all_encoder_weights.remove(module_name + "/" + encoder_name)
             uninitialized_encoder_weights += list(all_encoder_weights)

         # tie weights recursively
-        tie_encoder_to_decoder_recursively(decoder, encoder, base_model_prefix, uninitialized_encoder_weights)
+        tie_encoder_to_decoder_recursively(
+            decoder, encoder, base_model_prefix, base_encoder_name, uninitialized_encoder_weights
+        )
+
         if len(uninitialized_encoder_weights) > 0:
             logger.warning(
                 f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}"
             )
+        return tied_weights

     def _tie_or_clone_weights(self, output_embeddings, input_embeddings):
         """Tie or clone module weights depending on whether we are using TorchScript or not"""
@@ -2402,34 +2496,49 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
         # These are all
the pointers of shared tensors. shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} - warn_names = set() + error_names = [] + to_delete_names = set() + # Recursively descend to find tied weight keys + _tied_weights_keys = _get_tied_weight_keys(self) for names in shared_ptrs.values(): # Removing the keys which are declared as known duplicates on # load. This allows to make sure the name which is kept is consistent. - if self._tied_weights_keys is not None: + if _tied_weights_keys is not None: found = 0 for name in sorted(names): - matches_pattern = any(re.search(pat, name) for pat in self._tied_weights_keys) + matches_pattern = any(re.search(pat, name) for pat in _tied_weights_keys) if matches_pattern and name in state_dict: found += 1 if found < len(names): - del state_dict[name] + to_delete_names.add(name) + # We are entering a place where the weights and the transformers configuration do NOT match. + shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict) + # Those are actually tensor sharing but disjoint from each other, we can safely clone them + # Reloaded won't have the same property, but it shouldn't matter in any meaningful way. + for name in disjoint_names: + state_dict[name] = state_dict[name].clone() - # When not all duplicates have been cleaned, still remove those keys, but put a clear warning. - # If the link between tensors was done at runtime then `from_pretrained` will not get - # the key back leading to random tensor. A proper warning will be shown - # during reload (if applicable), but since the file is not necessarily compatible with - # the config, better show a proper warning. - found = 0 - for name in names: - if name in state_dict: - found += 1 - if found > 1: - del state_dict[name] - warn_names.add(name) - if len(warn_names) > 0: - logger.warning_once( - f"Removed shared tensor {warn_names} while saving. This should be OK, but check by verifying that you don't receive any warning while reloading", + # When not all duplicates have been cleaned, still remove those keys, but put a clear warning. + # If the link between tensors was done at runtime then `from_pretrained` will not get + # the key back leading to random tensor. A proper warning will be shown + # during reload (if applicable), but since the file is not necessarily compatible with + # the config, better show a proper warning. + shared_names, identical_names = _find_identical(shared_names, state_dict) + # delete tensors that have identical storage + for inames in identical_names: + known = inames.intersection(to_delete_names) + for name in known: + del state_dict[name] + unknown = inames.difference(to_delete_names) + if len(unknown) > 1: + error_names.append(unknown) + + if shared_names: + error_names.append(set(shared_names)) + + if len(error_names) > 0: + raise RuntimeError( + f"The weights trying to be saved contained shared tensors {error_names} that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.", ) # Shard the model if it is too big. diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py index 1b06c37578..262fc79f0d 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py @@ -15,7 +15,6 @@ # limitations under the License. 
"""PyTorch BERT model.""" - import math import os import warnings @@ -1128,7 +1127,7 @@ class BertForPreTraining(BertPreTrainedModel): """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING ) class BertLMHeadModel(BertPreTrainedModel): - _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] + _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] def __init__(self, config): super().__init__(config) diff --git a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py index 1a6adcee1f..16248fee64 100644 --- a/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py +++ b/src/transformers/models/encoder_decoder/modeling_encoder_decoder.py @@ -262,9 +262,16 @@ class EncoderDecoderModel(PreTrainedModel): if self.config.tie_encoder_decoder: # tie encoder and decoder base model decoder_base_model_prefix = self.decoder.base_model_prefix - self._tie_encoder_decoder_weights( - self.encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix + tied_weights = self._tie_encoder_decoder_weights( + self.encoder, + self.decoder._modules[decoder_base_model_prefix], + self.decoder.base_model_prefix, + "encoder", ) + # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class + # attributed not an instance member, therefore modifying it will modify the entire class + # Leading to issues on subsequent calls by different tests or subsequent calls. + self._dynamic_tied_weights_keys = tied_weights def get_encoder(self): return self.encoder diff --git a/src/transformers/models/marian/modeling_marian.py b/src/transformers/models/marian/modeling_marian.py index 7c39acbcd4..10d7f1b6b2 100755 --- a/src/transformers/models/marian/modeling_marian.py +++ b/src/transformers/models/marian/modeling_marian.py @@ -1343,7 +1343,13 @@ class MarianMTModel(MarianPreTrainedModel): if getattr(self.config, "is_encoder_decoder", False) and getattr(self.config, "tie_encoder_decoder", False): if hasattr(self, self.base_model_prefix): self = getattr(self, self.base_model_prefix) - self._tie_encoder_decoder_weights(self.encoder, self.decoder, self.base_model_prefix) + tied_weights = self._tie_encoder_decoder_weights( + self.encoder, self.decoder, self.base_model_prefix, "encoder" + ) + # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class + # attributed not an instance member, therefore modifying it will modify the entire class + # Leading to issues on subsequent calls by different tests or subsequent calls. 
+            self._dynamic_tied_weights_keys = tied_weights

         for module in self.modules():
             if hasattr(module, "_tie_weights"):
diff --git a/src/transformers/models/musicgen/modeling_musicgen.py b/src/transformers/models/musicgen/modeling_musicgen.py
index 2520268f74..7e7c7cb723 100644
--- a/src/transformers/models/musicgen/modeling_musicgen.py
+++ b/src/transformers/models/musicgen/modeling_musicgen.py
@@ -1891,9 +1891,16 @@ class MusicgenForConditionalGeneration(PreTrainedModel):
         if self.config.tie_encoder_decoder:
             # tie text encoder and decoder base model
             decoder_base_model_prefix = self.decoder.base_model_prefix
-            self._tie_encoder_decoder_weights(
-                self.text_encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
+            tied_weights = self._tie_encoder_decoder_weights(
+                self.text_encoder,
+                self.decoder._modules[decoder_base_model_prefix],
+                self.decoder.base_model_prefix,
+                "text_encoder",
             )
+            # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class
+            # attribute, not an instance member; modifying it would modify the entire class,
+            # leading to issues on subsequent calls by different tests or subsequent calls.
+            self._dynamic_tied_weights_keys = tied_weights

     def get_audio_encoder(self):
         return self.audio_encoder
diff --git a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
index 8b0afb2367..0840635f65 100644
--- a/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
+++ b/src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
@@ -1810,9 +1810,16 @@ class MusicgenMelodyForConditionalGeneration(PreTrainedModel):
         if self.config.tie_encoder_decoder:
             # tie text encoder and decoder base model
             decoder_base_model_prefix = self.decoder.base_model_prefix
-            self._tie_encoder_decoder_weights(
-                self.text_encoder, self.decoder._modules[decoder_base_model_prefix], self.decoder.base_model_prefix
+            tied_weights = self._tie_encoder_decoder_weights(
+                self.text_encoder,
+                self.decoder._modules[decoder_base_model_prefix],
+                self.decoder.base_model_prefix,
+                "text_encoder",
             )
+            # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class
+            # attribute, not an instance member; modifying it would modify the entire class,
+            # leading to issues on subsequent calls by different tests or subsequent calls.
+ self._dynamic_tied_weights_keys = tied_weights def get_text_encoder(self): return self.text_encoder diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 7f82d0dfca..e6f57d68cc 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -101,7 +101,7 @@ if is_torch_available(): _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask, ) - from transformers.modeling_utils import shard_checkpoint + from transformers.modeling_utils import _find_disjoint, _find_identical, shard_checkpoint # Fake pretrained models for tests class BaseModel(PreTrainedModel): @@ -256,6 +256,26 @@ class ModelUtilsTest(TestCasePlus): self.assertTrue(check_models_equal(model, model_loaded)) + def test_model_manually_shared_disjointed_tensors_optimum(self): + config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert") + model = BertModel(config) + + # Let's fuse qkv + attn = model.encoder.layer[0].attention.self + q = attn.query.weight + k = attn.key.weight + v = attn.value.weight + # Force some shared storage + qkv = torch.stack([q, k, v], dim=0) + attn.query.weight = torch.nn.Parameter(qkv[0]) + attn.key.weight = torch.nn.Parameter(qkv[1]) + attn.value.weight = torch.nn.Parameter(qkv[2]) + with tempfile.TemporaryDirectory() as tmp_dir: + model.save_pretrained(tmp_dir) + model_loaded = BertModel.from_pretrained(tmp_dir) + + self.assertTrue(check_models_equal(model, model_loaded)) + def test_model_from_pretrained_subfolder_sharded(self): config = BertConfig.from_pretrained("hf-internal-testing/tiny-random-bert") model = BertModel(config) @@ -2222,3 +2242,40 @@ class Mask4DTestHard(unittest.TestCase): ] self.assertEqual(decoded_0, decoded_1b) + + +@require_torch +class TestTensorSharing(TestCasePlus): + def test_disjoint(self): + main = torch.zeros(10) + a = main[:5] + b = main[5:] + state_dict = {"a": a, "b": b} + + shared_names, disjoint_names = _find_disjoint([{"a", "b"}], state_dict) + self.assertEqual(shared_names, []) + self.assertEqual(disjoint_names, ["a", "b"]) + + a = main[::2] + b = main[1::2] + state_dict = {"a": a, "b": b} + + shared_names, disjoint_names = _find_disjoint([{"a", "b"}], state_dict) + self.assertEqual(shared_names, [{"a", "b"}]) + self.assertEqual(disjoint_names, []) + + def test_identical(self): + a = torch.zeros(10) + b = a + state_dict = {"a": a, "b": b} + + shared_names, identical_names = _find_identical([{"a", "b"}], state_dict) + self.assertEqual(shared_names, []) + self.assertEqual(identical_names, [{"a", "b"}]) + + b = a[:5] + state_dict = {"a": a, "b": b} + + shared_names, identical_names = _find_identical([{"a", "b"}], state_dict) + self.assertEqual(shared_names, [{"a", "b"}]) + self.assertEqual(identical_names, []) From 5080ab12c818d3875858ad37b667c00c6f09f094 Mon Sep 17 00:00:00 2001 From: Joao Gante Date: Tue, 2 Apr 2024 17:18:31 +0100 Subject: [PATCH 007/110] Generate: fix logits processors doctests (#29718) * fix norm * fix logits processors doctests --- src/transformers/generation/logits_process.py | 76 +++++++------------ .../models/whisper/generation_whisper.py | 8 +- 2 files changed, 28 insertions(+), 56 deletions(-) diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 5181b59ab5..527bb9bc1e 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -261,8 +261,8 @@ class TemperatureLogitsWarper(LogitsWarper): >>> generate_kwargs = {"max_new_tokens": 10, "do_sample": True, "temperature": 1.0, 
"num_return_sequences": 2} >>> outputs = model.generate(**inputs, **generate_kwargs) >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)) - ['Hugging Face Company is a joint venture between GEO Group, one of', - 'Hugging Face Company is not an exact science – but what we believe does'] + ['Hugging Face Company is one of these companies that is going to take a', + "Hugging Face Company is a brand created by Brian A. O'Neil"] >>> # However, with temperature close to 0, it approximates greedy decoding strategies (invariant) >>> generate_kwargs["temperature"] = 0.0001 @@ -419,7 +419,7 @@ class TopPLogitsWarper(LogitsWarper): ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed - >>> set_seed(0) + >>> set_seed(1) >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2") @@ -428,7 +428,9 @@ class TopPLogitsWarper(LogitsWarper): >>> # With sampling, the output is unexpected -- sometimes too unexpected. >>> outputs = model.generate(**inputs, do_sample=True) >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]) - A sequence: 1, 2, 0, 2, 2. 2, 2, 2, 2 + A sequence: 1, 2, 3 | < 4 (left-hand pointer) ; + + >>> # With `top_p` sampling, the output gets restricted to high-probability tokens. >>> # Pro tip: In practice, LLMs use `top_p` in the 0.9-0.95 range. @@ -483,7 +485,7 @@ class TopKLogitsWarper(LogitsWarper): ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed - >>> set_seed(0) + >>> set_seed(1) >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2") @@ -492,7 +494,7 @@ class TopKLogitsWarper(LogitsWarper): >>> # With sampling, the output is unexpected -- sometimes too unexpected. >>> outputs = model.generate(**inputs, do_sample=True) >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]) - A sequence: A, B, C, D, G, H, I. A, M + A sequence: A, B, C, D, E — S — O, P — R >>> # With `top_k` sampling, the output gets restricted the k most likely tokens. >>> # Pro tip: In practice, LLMs use `top_k` in the 5-50 range. @@ -624,7 +626,7 @@ class EpsilonLogitsWarper(LogitsWarper): ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed - >>> set_seed(0) + >>> set_seed(1) >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2") @@ -633,7 +635,9 @@ class EpsilonLogitsWarper(LogitsWarper): >>> # With sampling, the output is unexpected -- sometimes too unexpected. >>> outputs = model.generate(**inputs, do_sample=True) >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]) - A sequence: 1, 2, 0, 2, 2. 2, 2, 2, 2 + A sequence: 1, 2, 3 | < 4 (left-hand pointer) ; + + >>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to >>> # Top P sampling, which restricts tokens based on their cumulative probability. 
@@ -701,7 +705,7 @@ class EtaLogitsWarper(LogitsWarper): ```python >>> from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed - >>> set_seed(0) + >>> set_seed(1) >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2") @@ -710,7 +714,9 @@ class EtaLogitsWarper(LogitsWarper): >>> # With sampling, the output is unexpected -- sometimes too unexpected. >>> outputs = model.generate(**inputs, do_sample=True) >>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]) - A sequence: 1, 2, 0, 2, 2. 2, 2, 2, 2 + A sequence: 1, 2, 3 | < 4 (left-hand pointer) ; + + >>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of >>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff). @@ -1211,16 +1217,16 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor): >>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix. >>> # For instance, we can force an entire entity to be generated when its beginning is detected. - >>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens + >>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens >>> def prefix_allowed_tokens_fn(batch_id, input_ids): ... ''' ... Attempts to generate 'Bob Marley' when 'Bob' is detected. ... In this case, `batch_id` is not used, but you can set rules for each batch member. ... ''' ... if input_ids[-1] == entity[0]: - ... return entity[1] + ... return [entity[1].item()] ... elif input_ids[-2] == entity[0] and input_ids[-1] == entity[1]: - ... return entity[2] + ... return [entity[2].item()] ... return list(range(tokenizer.vocab_size)) # If no match, allow all tokens >>> outputs = model.generate(**inputs, max_new_tokens=5, prefix_allowed_tokens_fn=prefix_allowed_tokens_fn) @@ -1618,13 +1624,13 @@ class LogitNormalization(LogitsProcessor, LogitsWarper): >>> # By default, the scores are not normalized -- the sum of their exponentials is NOT a normalized probability >>> # distribution, summing to 1 >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) - >>> print(torch.sum(torch.exp(outputs.scores[-1]))) - tensor(816.3250) + >>> print(torch.allclose(torch.sum(torch.exp(outputs.scores[-1])), torch.Tensor((1.000,)), rtol=1e-4)) + False >>> # Normalizing them may have a positive impact on beam methods, or when using the scores on your application >>> outputs = model.generate(**inputs, renormalize_logits=True, return_dict_in_generate=True, output_scores=True) - >>> print(torch.sum(torch.exp(outputs.scores[-1]))) - tensor(1.0000) + >>> print(torch.allclose(torch.sum(torch.exp(outputs.scores[-1])), torch.Tensor((1.000,)), rtol=1e-4)) + True ``` """ @@ -1655,7 +1661,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): >>> # Whisper has `begin_suppress_tokens` set by default (= `[220, 50256]`). 50256 is the EOS token, so this means >>> # it can't generate and EOS token in the first iteration, but it can in the others. 
>>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) - >>> print(outputs.scores[1][0, 50256]) # 1 (and not 0) is the first freely generated token + >>> print(outputs.scores[0][0, 50256]) tensor(-inf) >>> print(outputs.scores[-1][0, 50256]) # in other places we can see some probability mass for EOS tensor(29.9010) @@ -1664,7 +1670,7 @@ class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor): >>> outputs = model.generate( ... **inputs, return_dict_in_generate=True, output_scores=True, begin_suppress_tokens=None ... ) - >>> print(outputs.scores[1][0, 50256]) + >>> print(outputs.scores[0][0, 50256]) tensor(11.2027) ``` """ @@ -1713,7 +1719,7 @@ class SuppressTokensLogitsProcessor(LogitsProcessor): >>> # If we disable `suppress_tokens`, we can generate it. >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, suppress_tokens=None) >>> print(outputs.scores[1][0, 1]) - tensor(5.7738) + tensor(6.0678) ``` """ @@ -1735,36 +1741,6 @@ class ForceTokensLogitsProcessor(LogitsProcessor): indices that will be forced before generation. The processor will set their log probs to `inf` so that they are sampled at their corresponding index. Originally created for [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper). - - Examples: - ```python - >>> from transformers import AutoProcessor, WhisperForConditionalGeneration - >>> from datasets import load_dataset - - >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") - >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation") - >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt") - - >>> # This Whisper model forces the generation to start with `50362` at the first position by default, i.e. - >>> # `"forced_decoder_ids": [[1, 50362]]`. This means all other tokens are masked out. - >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True) - >>> print( - ... all(outputs.scores[0][0, i] == float("-inf") for i in range(processor.tokenizer.vocab_size) if i != 50362) - ... ) - True - >>> print(outputs.scores[0][0, 50362]) - tensor(0.) - - >>> # If we disable `forced_decoder_ids`, we stop seeing that effect - >>> outputs = model.generate(**inputs, return_dict_in_generate=True, output_scores=True, forced_decoder_ids=None) - >>> print( - ... all(outputs.scores[0][0, i] == float("-inf") for i in range(processor.tokenizer.vocab_size) if i != 50362) - ... ) - False - >>> print(outputs.scores[0][0, 50362]) - tensor(19.3140) - ``` """ def __init__(self, force_token_map: List[List[int]], _has_warned: Optional[bool] = False): diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 0810707bd0..8eca0c48b5 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -12,7 +12,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import copy import math import warnings import zlib @@ -474,11 +473,8 @@ class WhisperGenerationMixin: "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.", FutureWarning, ) - # 1. 
copy generation config - if generation_config is None: - generation_config = copy.deepcopy(self.generation_config) - else: - generation_config = copy.deepcopy(generation_config) + # 1. prepare generation config + generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs) # 2. set global generate variables input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0] From fce52cefa744a5900fc065aafb2f55d846d1202c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mario=20=C5=A0a=C5=A1ko?= Date: Tue, 2 Apr 2024 19:15:27 +0200 Subject: [PATCH 008/110] Fix `remove_columns` in `text-classification` example (#29351) --- examples/pytorch/text-classification/run_classification.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/pytorch/text-classification/run_classification.py b/examples/pytorch/text-classification/run_classification.py index 0b3d6517c7..982dbf9cc7 100755 --- a/examples/pytorch/text-classification/run_classification.py +++ b/examples/pytorch/text-classification/run_classification.py @@ -422,7 +422,7 @@ def main(): for split in raw_datasets.keys(): for column in data_args.remove_columns.split(","): logger.info(f"removing column {column} from split {split}") - raw_datasets[split].remove_columns(column) + raw_datasets[split] = raw_datasets[split].remove_columns(column) if data_args.label_column_name is not None and data_args.label_column_name != "label": for key in raw_datasets.keys(): From b44df05bc0866f88f06c8c14b392afc197a8c8b6 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 3 Apr 2024 09:25:01 +0200 Subject: [PATCH 009/110] Update `tests/utils/tiny_model_summary.json` (#29941) update Co-authored-by: ydshieh --- tests/utils/tiny_model_summary.json | 44 ----------------------------- 1 file changed, 44 deletions(-) diff --git a/tests/utils/tiny_model_summary.json b/tests/utils/tiny_model_summary.json index 5f2c6c0b4e..7d9140f379 100644 --- a/tests/utils/tiny_model_summary.json +++ b/tests/utils/tiny_model_summary.json @@ -4917,50 +4917,6 @@ ], "sha": "b8c8d479e29e9ee048e2d0b05b001ac835ad8859" }, - "PhiForCausalLM": { - "tokenizer_classes": [ - "CodeGenTokenizer", - "CodeGenTokenizerFast" - ], - "processor_classes": [], - "model_classes": [ - "PhiForCausalLM" - ], - "sha": "3fecc0109a4a3a230e3a5509eaf47a26eba85d79" - }, - "PhiForSequenceClassification": { - "tokenizer_classes": [ - "CodeGenTokenizer", - "CodeGenTokenizerFast" - ], - "processor_classes": [], - "model_classes": [ - "PhiForSequenceClassification" - ], - "sha": "e1c9f8ebf1317516acc1cd6338de71a53e770245" - }, - "PhiForTokenClassification": { - "tokenizer_classes": [ - "CodeGenTokenizer", - "CodeGenTokenizerFast" - ], - "processor_classes": [], - "model_classes": [ - "PhiForTokenClassification" - ], - "sha": "d3a8054903753b5c96c05eaf9877905a116a1d5e" - }, - "PhiModel": { - "tokenizer_classes": [ - "CodeGenTokenizer", - "CodeGenTokenizerFast" - ], - "processor_classes": [], - "model_classes": [ - "PhiModel" - ], - "sha": "99c38d5ce7ace35127d00ed3eeb3561308ea6b21" - }, "Pix2StructForConditionalGeneration": { "tokenizer_classes": [ "T5TokenizerFast" From 81642d2b51de9d5e5aee1768abdc744d90f7f52d Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Wed, 3 Apr 2024 17:11:01 +0800 Subject: [PATCH 010/110] Make EncodecModel.decode ONNX exportable (#29913) * fix encodec onnx export for musicgen * simplification * fix quality * better style --- .../models/encodec/modeling_encodec.py | 
36 +++++++++++++--------
 1 file changed, 22 insertions(+), 14 deletions(-)

diff --git a/src/transformers/models/encodec/modeling_encodec.py b/src/transformers/models/encodec/modeling_encodec.py
index bd56661b19..5a299b601b 100644
--- a/src/transformers/models/encodec/modeling_encodec.py
+++ b/src/transformers/models/encodec/modeling_encodec.py
@@ -111,14 +111,27 @@ class EncodecConv1d(nn.Module):
         elif self.norm_type == "time_group_norm":
             self.norm = nn.GroupNorm(1, out_channels)
 
-    @staticmethod
+        kernel_size = self.conv.kernel_size[0]
+        stride = torch.tensor(self.conv.stride[0], dtype=torch.int64)
+        dilation = self.conv.dilation[0]
+
+        # Effective kernel size with dilations.
+        kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64)
+
+        self.register_buffer("stride", stride, persistent=False)
+        self.register_buffer("kernel_size", kernel_size, persistent=False)
+        self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
+
     def _get_extra_padding_for_conv1d(
-        hidden_states: torch.Tensor, kernel_size: int, stride: int, padding_total: int = 0
-    ) -> int:
+        self,
+        hidden_states: torch.Tensor,
+    ) -> torch.Tensor:
         """See `pad_for_conv1d`."""
         length = hidden_states.shape[-1]
-        n_frames = (length - kernel_size + padding_total) / stride + 1
-        ideal_length = (math.ceil(n_frames) - 1) * stride + (kernel_size - padding_total)
+        n_frames = (length - self.kernel_size + self.padding_total) / self.stride + 1
+        n_frames = torch.ceil(n_frames).to(torch.int64) - 1
+        ideal_length = n_frames * self.stride + self.kernel_size - self.padding_total
+
         return ideal_length - length
 
     @staticmethod
@@ -141,20 +154,15 @@ class EncodecConv1d(nn.Module):
         return padded[..., :end]
 
     def forward(self, hidden_states):
-        kernel_size = self.conv.kernel_size[0]
-        stride = self.conv.stride[0]
-        dilation = self.conv.dilation[0]
-        kernel_size = (kernel_size - 1) * dilation + 1  # effective kernel size with dilations
-        padding_total = kernel_size - stride
-        extra_padding = self._get_extra_padding_for_conv1d(hidden_states, kernel_size, stride, padding_total)
+        extra_padding = self._get_extra_padding_for_conv1d(hidden_states)
+
         if self.causal:
             # Left padding for causal
-            hidden_states = self._pad1d(hidden_states, (padding_total, extra_padding), mode=self.pad_mode)
+            hidden_states = self._pad1d(hidden_states, (self.padding_total, extra_padding), mode=self.pad_mode)
         else:
             # Asymmetric padding required for odd strides
-            padding_right = padding_total // 2
-            padding_left = padding_total - padding_right
+            padding_right = self.padding_total // 2
+            padding_left = self.padding_total - padding_right
             hidden_states = self._pad1d(
                 hidden_states, (padding_left, padding_right + extra_padding), mode=self.pad_mode
             )

From 17b06e2c6650de162e7954babf6224c1975c2852 Mon Sep 17 00:00:00 2001
From: Miguel Almeida
Date: Wed, 3 Apr 2024 14:54:45 +0100
Subject: [PATCH 011/110] Fix Swinv2ForImageClassification NaN output (#29981)

To address the issue of NaN logit outputs for certain combinations of the
`image_size`, `patch_size` and `depths` configuration parameters, a check
was added to ensure that the resulting `window_size` attribute of the
model's self-attention class is greater than 1 before `relative_coords_table`
is normalized, preventing division by zero.
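A minimal reproduction of that failure mode, standalone and illustrative rather than a quote of the model code, mirroring how `relative_coords_table` is built when the window degenerates to 1x1:

```python
import torch

# e.g. when the feature map at some depth is a single patch wide and tall
window_size = (1, 1)

relative_coords_h = torch.arange(-(window_size[0] - 1), window_size[0], dtype=torch.float32)
relative_coords_w = torch.arange(-(window_size[1] - 1), window_size[1], dtype=torch.float32)
relative_coords_table = (
    torch.stack(torch.meshgrid([relative_coords_h, relative_coords_w], indexing="ij"))
    .permute(1, 2, 0)
    .unsqueeze(0)
)

# Without the `window_size > 1` guard, normalization divides by (1 - 1) == 0,
# and the resulting 0/0 == NaN propagates into the attention logits.
relative_coords_table[:, :, :, 0] /= window_size[0] - 1
print(relative_coords_table)  # tensor([[[[nan, 0.]]]])
```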
Fix: #28675 --- src/transformers/models/swin2sr/modeling_swin2sr.py | 2 +- src/transformers/models/swinv2/modeling_swinv2.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/swin2sr/modeling_swin2sr.py b/src/transformers/models/swin2sr/modeling_swin2sr.py index 1ef628a144..fb3c0a38f2 100644 --- a/src/transformers/models/swin2sr/modeling_swin2sr.py +++ b/src/transformers/models/swin2sr/modeling_swin2sr.py @@ -298,7 +298,7 @@ class Swin2SRSelfAttention(nn.Module): if pretrained_window_size[0] > 0: relative_coords_table[:, :, :, 0] /= pretrained_window_size[0] - 1 relative_coords_table[:, :, :, 1] /= pretrained_window_size[1] - 1 - else: + elif window_size > 1: relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1 relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1 relative_coords_table *= 8 # normalize to -8, 8 diff --git a/src/transformers/models/swinv2/modeling_swinv2.py b/src/transformers/models/swinv2/modeling_swinv2.py index 16c68ee63f..a83965ede7 100644 --- a/src/transformers/models/swinv2/modeling_swinv2.py +++ b/src/transformers/models/swinv2/modeling_swinv2.py @@ -454,7 +454,7 @@ class Swinv2SelfAttention(nn.Module): if pretrained_window_size[0] > 0: relative_coords_table[:, :, :, 0] /= pretrained_window_size[0] - 1 relative_coords_table[:, :, :, 1] /= pretrained_window_size[1] - 1 - else: + elif window_size > 1: relative_coords_table[:, :, :, 0] /= self.window_size[0] - 1 relative_coords_table[:, :, :, 1] /= self.window_size[1] - 1 relative_coords_table *= 8 # normalize to -8, 8 From 851f253f4d3fa2414451eeaac82b7a9ad6084675 Mon Sep 17 00:00:00 2001 From: Ren Xuancheng Date: Wed, 3 Apr 2024 23:42:43 +0800 Subject: [PATCH 012/110] Fix Qwen2Tokenizer (#29929) qwen2: fixed tokens starting with # in slow tokenizer; add tests Co-authored-by: jklj077 <17811943+jklj077@users.noreply.github.com> --- .../models/qwen2/tokenization_qwen2.py | 4 ++-- tests/models/qwen2/test_tokenization_qwen2.py | 23 +++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/qwen2/tokenization_qwen2.py b/src/transformers/models/qwen2/tokenization_qwen2.py index 22cffcb608..be2685430f 100644 --- a/src/transformers/models/qwen2/tokenization_qwen2.py +++ b/src/transformers/models/qwen2/tokenization_qwen2.py @@ -177,9 +177,9 @@ class Qwen2Tokenizer(PreTrainedTokenizer): self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} bpe_merges = [] with open(merges_file, encoding="utf-8") as merges_handle: - for line in merges_handle: + for i, line in enumerate(merges_handle): line = line.strip() - if not line or line.startswith("#"): + if (i == 0 and line.startswith("#version:")) or not line: continue bpe_merges.append(tuple(line.split())) self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) diff --git a/tests/models/qwen2/test_tokenization_qwen2.py b/tests/models/qwen2/test_tokenization_qwen2.py index 3193141b84..fba44c6dc8 100644 --- a/tests/models/qwen2/test_tokenization_qwen2.py +++ b/tests/models/qwen2/test_tokenization_qwen2.py @@ -59,6 +59,8 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): ";}", ";}\u010a", "\u00cf\u0135", + "\u0120#", + "##", ] ) @@ -75,6 +77,8 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): "; }", ";} \u010a", "\u00cf \u0135", + "\u0120 #", + "# #", ] self.special_tokens_map = {"eos_token": "<|endoftext|>"} @@ -129,7 +133,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): self.assertListEqual(tokens, 
bpe_tokens) input_tokens = tokens - input_bpe_tokens = [75, 78, 86, 260, 259, 260, 220, 77, 68, 86, 260, 220, 15, 16, 15, 266, 268, 267] + input_bpe_tokens = [75, 78, 86, 260, 259, 260, 220, 77, 68, 86, 260, 220, 15, 16, 15, 266, 270, 267] self.assertListEqual(tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens) @unittest.skip("We disable the test of pretokenization as it is not reversible.") @@ -139,6 +143,11 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # the results, by nature, should be different. pass + @unittest.skip("We disable the test of clean up tokenization spaces as it is not applicable.") + def test_clean_up_tokenization_spaces(self): + # it only tests bert-base-uncased and clean_up_tokenization_spaces is not applicable to this tokenizer + pass + def test_nfc_normalization(self): # per https://unicode.org/faq/normalization.html, there are three characters whose normalization forms # under NFC, NFD, NFKC, and NFKD are all different @@ -158,6 +167,16 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_output_string = tokenizer.backend_tokenizer.normalizer.normalize_str(input_string) self.assertEqual(tokenizer_output_string, output_string) + def test_slow_tokenizer_token_with_number_sign(self): + if not self.test_slow_tokenizer: + return + + sequence = " ###" + token_ids = [268, 269] + + tokenizer = self.get_tokenizer() + self.assertListEqual(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sequence)), token_ids) + def test_slow_tokenizer_decode_spaces_between_special_tokens_default(self): # Qwen2Tokenizer changes the default `spaces_between_special_tokens` in `decode` to False if not self.test_slow_tokenizer: @@ -166,7 +185,7 @@ class Qwen2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): # tokenizer has a special token: `"<|endfotext|>"` as eos, but it is not `legacy_added_tokens` # special tokens in `spaces_between_special_tokens` means spaces between `legacy_added_tokens` # that would be `"<|im_start|>"` and `"<|im_end|>"` in Qwen/Qwen2 Models - token_ids = [259, 260, 268, 269, 26] + token_ids = [259, 260, 270, 271, 26] sequence = " lower<|endoftext|><|im_start|>;" sequence_with_space = " lower<|endoftext|> <|im_start|> ;" From bcd42c4af909c92da94fd5884989c56db258f12f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Wed, 3 Apr 2024 17:51:03 +0200 Subject: [PATCH 013/110] Fix `kwargs` handling in `generate_with_fallback` (#29225) * Fix generate_with_fallback **kwargs * Change pop to get * Delete keys from kwargs to prevent overriding generation_config * Revert to passing kwargs by reference, but make a (shallow) copy * dict -> copy.copy * Add test_whisper_longform_multi_batch_beam --- .../models/whisper/generation_whisper.py | 10 +++- tests/models/whisper/test_modeling_whisper.py | 55 +++++++++++++++++++ 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 8eca0c48b5..1e7a56c4cd 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -755,6 +755,8 @@ class WhisperGenerationMixin: do_condition_on_prev_tokens, kwargs, ): + kwargs = copy.copy(kwargs) + # 6.6 Batch generate current chunk seek_sequence_list = [None for _ in range(cur_bsz)] seek_outputs_list = [None for _ in range(cur_bsz)] @@ -769,8 +771,12 @@ class WhisperGenerationMixin: generation_config.do_sample = temperature 
is not None and temperature > 0.0 generation_config.temperature = temperature if generation_config.do_sample else 1.0 - generation_config.num_beams = kwargs.pop("num_beams", 1) if not generation_config.do_sample else 1 + generation_config.num_beams = kwargs.get("num_beams", 1) if not generation_config.do_sample else 1 + generate_kwargs = copy.copy(kwargs) + for key in ["do_sample", "temperature", "num_beams"]: + if key in generate_kwargs: + del generate_kwargs[key] seek_outputs = super().generate( segment_input, generation_config, @@ -779,7 +785,7 @@ class WhisperGenerationMixin: prefix_allowed_tokens_fn, synced_gpus, decoder_input_ids=decoder_input_ids, - **kwargs, + **generate_kwargs, ) # post-process sequence tokens and outputs to be in list form diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 7ff6387ff2..375d8e7399 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -1533,6 +1533,12 @@ class WhisperModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi @require_torch @require_torchaudio class WhisperModelIntegrationTests(unittest.TestCase): + def setUp(self): + self._unpatched_generation_mixin_generate = transformers.GenerationMixin.generate + + def tearDown(self): + transformers.GenerationMixin.generate = self._unpatched_generation_mixin_generate + @cached_property def default_processor(self): return WhisperProcessor.from_pretrained("openai/whisper-base") @@ -1544,6 +1550,16 @@ class WhisperModelIntegrationTests(unittest.TestCase): return [x["array"] for x in speech_samples] + def _patch_generation_mixin_generate(self, check_args_fn=None): + test = self + + def generate(self, *args, **kwargs): + if check_args_fn is not None: + check_args_fn(*args, **kwargs) + return test._unpatched_generation_mixin_generate(self, *args, **kwargs) + + transformers.GenerationMixin.generate = generate + @slow def test_tiny_logits_librispeech(self): torch_device = "cpu" @@ -2426,6 +2442,45 @@ class WhisperModelIntegrationTests(unittest.TestCase): assert decoded == EXPECTED_TEXT + @slow + def test_whisper_longform_multi_batch_beam(self): + # fmt: off + EXPECTED_TEXT = [' A man said to the universe, Sir, I exist. Sweat-covered Brienne\'s body trickling into the titling cloth that was the only german he wore. The cut on his chest was still dripping blood. The ache of his overstrained eyes, even the soaring arena around him with thousands of spectators, rich trivialities not worth thinking about. His instant panic was followed by a small sharp blow high on his chest. One minute, a voice said, and a time buzzer sounded. A minute is not a very large measure of time, and his body needed every fraction of it. The buzzers were, triggered his muscles into complete relaxation. Oily his heart and lungs worked on at a strong, measured rate. He was in reverie, sliding along the borders of consciousness. The contestants in the 20s needed undisturbed rest. Therefore, nights in the dormitories were as quiet as death. Particularly so, on this last night, when only two of the little cubicles were occupied. The thousands of others standing with dark empty doors. The other voice snapped with a harsh urgency, clearly used to command. I\'m here because the matter is of utmost importance, and brand is the one I must see. Now stand aside. The 20s, he must have drawn his gun because the intruder said quickly, but that away, you\'re being a fool. 
Out, there was silence then, and still wondering, Breon was once more asleep. Ten seconds, he asked the handler who was needing his aching muscles. A red-haired mountain of a man, with an apparently inexhaustible store of energy. There could be little art in this last and final round of fencing. Just thrust and parry, and victory to the stronger. Every man who entered the 20s had his own training tricks. There appeared to be an immediate association with the death trauma, as if the two were andextricably linked into one. The strength that enables someone in a trance to hold his body stiff and unsupported, except at two points, the head and heels. This is physically impossible when conscious. Others had died before during the 20s, and death during the last round was, in some ways, easier than defeat. Breeding deeply, Breon\'s softly spoke the auto-hypnotic phrases that triggered the process. When the buzzer sounded, he pulled his foil from his second startled grasp and ran forward. Our role looked amazed at the sudden fury of the attack, then smiled. He thought it was the last burst of energy. He knew how close they both were to exhaustion. Breon saw something close to panic on his opponent\'s face when the man finally recognized his error. A wave of despair rolled out from our rogue. Breon sensed it and knew the fifth point was his. Then the powerful twist that\'s rested aside, in and under the guard, Mr. Quilter is the apostle of the middle classes, and we\'re glad to welcome his gospel. Nor is Mr. Quilter\'s manner less interesting than his matter. He tells us that at this festive season of the year, with Christmas and Rose beef looming before us, similes drawn from eating and its results occur most readily to the mind. He has grave doubts whether Sir Frederick Layton\'s work is really Greek after all, and can discover in it but little of rocky Ithaca. Linnell\'s pictures are a sort of up-gards and atom paintings, and Mason\'s exquisite idles are as national as a jingo poem. Mr. Burkett Foster\'s landscapes smile at one much in the same way that Mr. Carker used to flash his teeth. And Mr. John Collier gives his sitter a cheerful slap in the back, before he says, like a shampooer and a Turkish bath. Next man, it is obviously unnecessary for us to point out how luminous these criticisms are, how delicate in expression. From the general principles of art, Mr. Quilter writes with equal lucidity. Painting he tells us is of a different quality to mathematics, and finish in art is adding more effect. As for etchings, there are two kinds, British and foreign. He laments most bitterly the divorce that has been made between decorative art and what we usually call pictures. The customary appeal to the last judgment and reminds us that in the great days of art Michelangelo was the furnishing upholsterer, near the fire, and the ornaments Fred brought home from India on the mental board. In fact, he is quite severe on Mr. Ruskin for not recognizing that a picture should denote the frailty of man. In remarks was pleasing courtesy and fellas of this grace that many faces are feeling. Only unfortunately his own work never does get good. Mr. Quilter has missed his chance, for he has failed even to make himself the tupper of painting. By Harry Quilter M.A. Because you are sleeping instead of conquering, the lovely rose princess has become a fiddle without a bow, while poor Shaggy sits there, accoing dove. 
He has gone, and gone for good," answered Polychrom, who had managed to squeeze into the room beside the dragon, and had witnessed the occurrences with much interest. I have remained a prisoner only because I wished to be one. And with this, he stepped forward and burst the stout chains as easily as if they had been threads. A little girl had been asleep, but she heard the wraps and opened the door. The king has fled and disgraced, and your friends are asking for you. I begged Ruggido long ago to send him away, but he would not do so. I also offered to help your brother to escape, but he would not go. He eats and sleeps very steadily, replied the new king. I hope he doesn\'t work too hard, since Shaggy. He doesn\'t work at all. In fact, there is nothing he can do in these dominions, as well as our gnomes, whose numbers are so great that it worries us to keep them all busy. Not exactly, we\'ve turned Calico, whereas my brother now, inquired Shaggy. In the metal forest. Where is that? The metal forest is in the great domed cavern, the largest in all our dominions, replied Calico. Calico hesitated. However, if we look sharp, we may be able to discover one of these secret ways. Oh no, I\'m quite sure he didn\'t. That\'s funny, remarked Betsy thoughtfully. I don\'t believe and knew any magic, or she\'d have worked it before. I do not know, confessed Shaggy. True, a great Calico. Calico went to the big gong and pounded on it, just as we\'re good to be used to do, but no one answered the summons. Having returned to the Royal Cavern, Calico first pounded the gong and then sat in the throne, wearing Regido\'s discarded ruby crown, and holding in his hand to scepter which Regido had so often thrown at his head.'] + # fmt: on + + processor = WhisperProcessor.from_pretrained("openai/whisper-tiny.en") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") + model = model.to(torch_device) + + ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean") + one_audio = np.concatenate([x["array"] for x in ds["validation"]["audio"]], dtype=np.float32) + + input_features = processor(one_audio, return_tensors="pt", truncation=False, padding="longest")[ + "input_features" + ] + input_features = input_features.to(device=torch_device) + + gen_kwargs = { + "return_timestamps": True, + "no_speech_threshold": 0.6, + "temperature": (0.0, 0.2, 0.4, 0.6, 0.8, 1.0), + "num_beams": 2, + "compression_ratio_threshold": 1.35, + "condition_on_prev_tokens": True, + "logprob_threshold": -1.0, + } + + def check_gen_kwargs(inputs, generation_config, *args, **kwargs): + assert generation_config.num_beams == gen_kwargs["num_beams"] + + self._patch_generation_mixin_generate(check_args_fn=check_gen_kwargs) + + torch.manual_seed(0) + result = model.generate(input_features, **gen_kwargs) + decoded = processor.batch_decode(result, skip_special_tokens=True) + + assert decoded == EXPECTED_TEXT + @slow def test_whisper_longform_multi_batch(self): # fmt: off From 240e10626b10574899ecd9a3ddcc47788f289732 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20C=C3=ADfka?= Date: Wed, 3 Apr 2024 17:53:07 +0200 Subject: [PATCH 014/110] Fix probability computation in `WhisperNoSpeechDetection` when recomputing scores (#29248) * Fix is_scores_logprobs in WhisperNoSpeechDetection * Add test_whisper_longform_no_speech_detection * Fix typo --- src/transformers/generation/logits_process.py | 5 +- tests/models/whisper/test_modeling_whisper.py | 53 +++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) 
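The crux of the fix, sketched standalone with made-up numbers rather than the processor's actual code: recomputed scores come from a fresh forward pass and are raw logits, so the log-probability branch must not be applied to them:

```python
import torch

no_speech_scores = torch.tensor([[1.5, 0.2, -0.7]])  # pretend index 0 is the no-speech token

# Raw logits need a softmax before the value can be read off as a probability.
prob = no_speech_scores.float().softmax(dim=-1)[0, 0]  # ~0.72

# Log-probabilities can simply be exponentiated. Taking this branch on raw
# logits, as happened before the fix whenever the scores were recomputed,
# yields exp(1.5) ~ 4.48, which is not a probability, so the no-speech
# threshold misfires.
bogus = no_speech_scores.exp()[0, 0]

print(prob, bogus)
```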
diff --git a/src/transformers/generation/logits_process.py b/src/transformers/generation/logits_process.py index 527bb9bc1e..ce91e8a40a 100644 --- a/src/transformers/generation/logits_process.py +++ b/src/transformers/generation/logits_process.py @@ -1930,6 +1930,8 @@ class WhisperNoSpeechDetection(LogitsProcessor): @add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING) def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor: + is_scores_logprobs = self.is_scores_logprobs + if input_ids.shape[1] == self.begin_index: if self.start_of_trans_offset > 1: with torch.no_grad(): @@ -1937,10 +1939,11 @@ class WhisperNoSpeechDetection(LogitsProcessor): no_speech_index = self.begin_index - self.start_of_trans_offset no_speech_scores = logits[:, no_speech_index] + is_scores_logprobs = False else: no_speech_scores = scores - if self.is_scores_logprobs: + if is_scores_logprobs: probs = no_speech_scores.exp() else: probs = no_speech_scores.float().softmax(dim=-1) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index 375d8e7399..a36bd5f216 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -2670,6 +2670,59 @@ class WhisperModelIntegrationTests(unittest.TestCase): for i in range(num_samples): assert decoded_all[i] == EXPECTED_TEXT[i] + @slow + def test_whisper_longform_no_speech_detection(self): + # fmt: off + EXPECTED_TEXT = [ + " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories. Developing the central headline pawns, definitely maneuvering and also topical night to F6.", + " Folks, I spent a lot of time right over there night after night, actually. Carefully selecting for you the day's newsiest, most aerodynamic headlines, stress testing", + ' Ladies and gentlemen, you know, I spent a lot of time right over there raising the finest Holstein news cattle firmly yet tenderly milking the latest headlines from their joke swollen teats', + ' Folks, you watched this show, you know I spend most of my time right over there, carefully sorting through the days, big stories, and selecting only the most subtle and unblemished ostrich and crocodile news leather, which I then entrust to artisan graduates of the', + " You know, folks, I spent a lot of time crafting for you a bespoke playlist of the day's big stories right over there. meticulously selecting the most topical chakra affirming scented candles, using Feng Shui,", + ' You know, folks, I spend most of my time right over there. Mining the days, biggest, most important stories, collecting the finest, most topical iron or hand hammering it into joke panels, then I craft sheets of bronze and blazing with patterns that tell an epic tale of conquest.', + " Folks, if you watch this show, you know I spend most of my time right over there, carefully blending for you the day's newsiest, most topical flower eggs, milk and butter. And straining into a fine batter to make delicate and informative comedy pancakes, then I glaze them in the juice and zest of the most...", + " Folks, if you watch the show and I hope you do, I spent a lot of time right over there. 
Tirelessly studying the lineage of the day's most important thoroughbred stories and whole-stiner headlines.", + ] + # fmt: on + + processor = WhisperProcessor.from_pretrained("openai/whisper-tiny") + model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny") + model = model.to(torch_device) + + ds = load_dataset("distil-whisper/meanwhile", "default")["test"] + ds = ds.cast_column("audio", Audio(sampling_rate=16000)) + + num_samples = 8 + + audio = ds[:num_samples]["audio"] + audios = [x["array"] for x in audio] + + # Make sure the second chunk is silent + for audio in audios: + audio[15 * 16000 : 60 * 16000] = 0.0 + + inputs = processor( + audios, return_tensors="pt", truncation=False, padding="longest", return_attention_mask=True + ) + inputs = inputs.to(device=torch_device) + + gen_kwargs = { + "return_timestamps": True, + "no_speech_threshold": 0.2, + "temperature": (0.0,), + "compression_ratio_threshold": 1.35, + "condition_on_prev_tokens": True, + "logprob_threshold": 0.0, # Ignore logprob, use only no-speech prob + "num_beams": 5, + } + + torch.manual_seed(0) + result = model.generate(**inputs, **gen_kwargs) + decoded_all = processor.batch_decode(result, skip_special_tokens=True) + + for i in range(num_samples): + assert decoded_all[i] == EXPECTED_TEXT[i] + def prepare_whisper_encoder_inputs_dict(config, input_features, head_mask=None): if head_mask is None: From cc75f1ac7302d31d30f9420e9d66cc3a11701c47 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 3 Apr 2024 21:00:08 +0500 Subject: [PATCH 015/110] Fix vipllava for generation (#29874) * fix vipllava generation * consistent llava code * revert llava tests changes --- src/transformers/models/llava_next/modeling_llava_next.py | 7 ++++--- src/transformers/models/vipllava/modeling_vipllava.py | 4 ++-- tests/models/llava_next/test_modeling_llava_next.py | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 54ad4d5a50..155d9e3e6a 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -569,10 +569,11 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel): batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) # Get the target length - target_seqlen = first_layer_past_key_value.shape[-1] + 1 + target_length = input_ids.shape[1] + past_length = first_layer_past_key_value.shape[-1] extended_attention_mask = torch.ones( - (attention_mask.shape[0], target_seqlen - attention_mask.shape[1]), + (attention_mask.shape[0], past_length), dtype=attention_mask.dtype, device=attention_mask.device, ) @@ -587,7 +588,7 @@ class LlavaNextForConditionalGeneration(LlavaNextPreTrainedModel): # Zero-out the places where we don't need to attend extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0 - attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1) + attention_mask = torch.cat((extended_attention_mask, attention_mask[:, -target_length:]), dim=1) position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1 outputs = self.language_model( diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index dda9549a4f..1b20353410 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ 
-441,10 +441,10 @@ class VipLlavaForConditionalGeneration(VipLlavaPreTrainedModel): if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1: # Retrieve the first layer to inspect the logits and mask out the hidden states # that are set to 0 - first_layer_past_key_value = past_key_values[0][0][:, 0, :, :] + first_layer_past_key_value = past_key_values[0][0][:, :, :, 0] # Sum all dimensions of head_dim (-1) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941 - batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-1) == 0) + batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0) target_length = input_ids.shape[1] past_length = first_layer_past_key_value.shape[-1] diff --git a/tests/models/llava_next/test_modeling_llava_next.py b/tests/models/llava_next/test_modeling_llava_next.py index 7e4469f306..1c7e320090 100644 --- a/tests/models/llava_next/test_modeling_llava_next.py +++ b/tests/models/llava_next/test_modeling_llava_next.py @@ -423,7 +423,7 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase): output = model(**inputs) expected_slice = torch.tensor( - [[-4.7695, -4.5664, -0.2786], [-10.6172, -10.8906, -2.5234], [-6.7344, -7.2422, -0.6758]], + [[-4.7695, -4.5664, -0.2786], [-10.6250, -10.8906, -2.5254], [-6.7383, -7.2461, -0.6787]], dtype=torch.float32, device=torch_device, ) From 34bfe95af53d7ab24b48b2f2e1a7547bb1f56361 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Wed, 3 Apr 2024 10:05:15 -0700 Subject: [PATCH 016/110] [docs] Fix audio file (#30006) new audio file --- docs/source/en/pipeline_tutorial.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/source/en/pipeline_tutorial.md b/docs/source/en/pipeline_tutorial.md index f41dc05c5e..42ea3b1d5f 100644 --- a/docs/source/en/pipeline_tutorial.md +++ b/docs/source/en/pipeline_tutorial.md @@ -167,9 +167,9 @@ for working on really long audio files (for example, subtitling entire movies or cannot handle on its own: ```python ->>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True) ->>> transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav") -{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came. I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. It was the day after the evening... +>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30) +>>> transcriber("https://huggingface.co/datasets/reach-vb/random-audios/resolve/main/ted_60.wav") +{'text': " So in college, I was a government major, which means I had to write a lot of papers. Now, when a normal student writes a paper, they might spread the work out a little like this. So, you know. You get started maybe a little slowly, but you get enough done in the first week that with some heavier days later on, everything gets done and things stay civil. And I would want to do that like that. That would be the plan. I would have it all ready to go, but then actually the paper would come along, and then I would kind of do this. And that would happen every single paper. But then came my 90-page senior thesis, a paper you're supposed to spend a year on. 
I knew for a paper like that, my normal workflow was not an option, it was way too big a project. So I planned things out and I decided I kind of had to go something like this. This is how the year would go. So I'd start off light and I'd bump it up"} ``` If you can't find a parameter that would really help you out, feel free to [request it](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)! From c10b5dd25ee238ff09ce3c2da8504c4affa50785 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 3 Apr 2024 22:32:01 +0500 Subject: [PATCH 017/110] Superpoint imports fix (#29898) quick fix --- .../models/superpoint/image_processing_superpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/superpoint/image_processing_superpoint.py b/src/transformers/models/superpoint/image_processing_superpoint.py index 8c7e2a7deb..fbbb717570 100644 --- a/src/transformers/models/superpoint/image_processing_superpoint.py +++ b/src/transformers/models/superpoint/image_processing_superpoint.py @@ -17,7 +17,7 @@ from typing import Dict, Optional, Union import numpy as np -from ... import is_vision_available, requires_backends +from ... import is_vision_available from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict from ...image_transforms import resize, to_channel_dimension_format from ...image_utils import ( @@ -29,7 +29,7 @@ from ...image_utils import ( to_numpy_array, valid_images, ) -from ...utils import TensorType, logging +from ...utils import TensorType, logging, requires_backends if is_vision_available(): From 695d82332373e052a03b48f58318d28879c7579f Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 3 Apr 2024 19:34:39 +0200 Subject: [PATCH 018/110] [`Main CIs`] Fix the red cis (#30022) * fix * sort imports --- src/transformers/models/whisper/generation_whisper.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 1e7a56c4cd..4d30a22c76 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import copy import math import warnings import zlib From 863e2562d8d8a535caccb644b15efec663248daa Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Wed, 3 Apr 2024 13:37:52 -0400 Subject: [PATCH 019/110] Make clearer about zero_init requirements (#29879) * Docstring to note about zero init * Check for accelerate * Change conditional return * Tweak * Add new accelerate-specific zero3 check * Fix import * Revert to RTFM * Update src/transformers/modeling_utils.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/training_args.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index e7dcc54deb..694c142437 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -504,6 +504,11 @@ class TrainingArguments: evolve in the future. 
The value is either the location of DeepSpeed json config file (e.g., `ds_config.json`) or an already loaded json file as a `dict`" + + If enabling any Zero-init, make sure that your model is not initialized until + *after* initializing the `TrainingArguments`, else it will not be applied. + + accelerator_config (`str`, `dict`, or `AcceleratorConfig`, *optional*): Config to be used with the internal `Accelerator` implementation. The value is either a location of accelerator json config file (e.g., `accelerator_config.json`), an already loaded json file as `dict`, From 03732dea60fba1da78c79eb59c443ebf975c2be6 Mon Sep 17 00:00:00 2001 From: Jacky Lee <39754370+jla524@users.noreply.github.com> Date: Wed, 3 Apr 2024 12:54:34 -0700 Subject: [PATCH 020/110] Enable multi-device for efficientnet (#29989) feat: enable mult-idevice for efficientnet --- src/transformers/models/efficientnet/modeling_efficientnet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/efficientnet/modeling_efficientnet.py b/src/transformers/models/efficientnet/modeling_efficientnet.py index 5b7ff534ee..e415d7f1b4 100644 --- a/src/transformers/models/efficientnet/modeling_efficientnet.py +++ b/src/transformers/models/efficientnet/modeling_efficientnet.py @@ -484,6 +484,7 @@ class EfficientNetPreTrainedModel(PreTrainedModel): config_class = EfficientNetConfig base_model_prefix = "efficientnet" main_input_name = "pixel_values" + _no_split_modules = [] def _init_weights(self, module): """Initialize the weights""" From 4e6c5eb0450feeccdfac399805b247f64352bd88 Mon Sep 17 00:00:00 2001 From: byi8220 Date: Thu, 4 Apr 2024 04:29:32 -0400 Subject: [PATCH 021/110] Add a converter from mamba_ssm -> huggingface mamba (#29705) * implement convert_mamba_ssm_checkpoint_to_pytorch * Add test test_model_from_mamba_ssm_conversion * moved convert_ssm_config_to_hf_config to inside mamba_ssm_available check * fix skipif clause * moved skips to inside test since skipif decorator isn't working for some reason * Added validation * removed test * fixup * only compare logits * remove weight rename * Update src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * nits --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- ...convert_mamba_ssm_checkpoint_to_pytorch.py | 153 ++++++++++++++++++ 1 file changed, 153 insertions(+) create mode 100644 src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py diff --git a/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py b/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py new file mode 100644 index 0000000000..0cf7dcc0ed --- /dev/null +++ b/src/transformers/models/mamba/convert_mamba_ssm_checkpoint_to_pytorch.py @@ -0,0 +1,153 @@ +# coding=utf-8 +# Copyright 2024 state-spaces/mamba org and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
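+
+# A usage sketch (paths are illustrative; the flags are defined by the argparse
+# block at the bottom of this file, and a CUDA device is required):
+#
+#   python convert_mamba_ssm_checkpoint_to_pytorch.py \
+#       -i mamba_checkpoint.bin -c config.json -o ./converted_model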
+"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. It depends on the `mamba_ssm` package to be installed.""" + +import argparse +import json +import math +from typing import Tuple + +import torch + +from transformers import AutoTokenizer, MambaConfig, MambaForCausalLM +from transformers.utils import logging +from transformers.utils.import_utils import is_mamba_ssm_available + + +if is_mamba_ssm_available(): + from mamba_ssm.models.config_mamba import MambaConfig as MambaConfigSSM + from mamba_ssm.models.mixer_seq_simple import MambaLMHeadModel + + def convert_ssm_config_to_hf_config(config_ssm: MambaConfigSSM) -> MambaConfig: + """Convert a MambaConfig from mamba_ssm to a MambaConfig from transformers.""" + hf_config = MambaConfig() + # Set config hidden size, num hidden layers, and vocab size directly from the original config + hf_config.hidden_size = config_ssm.d_model + hf_config.intermediate_size = config_ssm.d_model * 2 + hf_config.time_step_rank = math.ceil(config_ssm.d_model / 16) + + hf_config.num_hidden_layers = config_ssm.n_layer + vocab_size = config_ssm.vocab_size + pad_vocab_size_multiple = config_ssm.pad_vocab_size_multiple + if (vocab_size % pad_vocab_size_multiple) != 0: + vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) + hf_config.vocab_size = vocab_size + return hf_config + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def convert_mamba_ssm_checkpoint_to_huggingface_model( + original_state_dict: dict, original_ssm_config_dict: dict +) -> Tuple[MambaForCausalLM, AutoTokenizer]: + if not is_mamba_ssm_available(): + raise ImportError( + "Calling convert_mamba_ssm_checkpoint_to_huggingface_model requires the mamba_ssm library to be installed. Please install it with `pip install mamba_ssm`." + ) + original_ssm_config = MambaConfigSSM(**original_ssm_config_dict) + + # Convert mamba_ssm config to huggingface MambaConfig + hf_config = convert_ssm_config_to_hf_config(original_ssm_config) + + # No weights need to be renamed between the two models. + converted_state_dict = original_state_dict + + # Load reshaped state dict into a huggingface model. 
+ hf_model = MambaForCausalLM(hf_config) + tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neox-20b") + hf_model.load_state_dict(converted_state_dict) + return (hf_model, tokenizer) + + +def validate_converted_model( + original_state_dict: dict, original_ssm_config_dict: dict, hf_model: MambaForCausalLM, tokenizer: AutoTokenizer +) -> None: + """Validate the converted model returns the same output as the original model.""" + torch_device = "cuda" + + original_config = MambaConfigSSM(**original_ssm_config_dict) + original_model = MambaLMHeadModel(original_config).to(torch_device) + original_model.load_state_dict(original_state_dict) + + hf_model = hf_model.to(torch_device) + input_ids = tokenizer("Hey how are you doing?", return_tensors="pt")["input_ids"].to(torch_device) + # Assert model logits are close + with torch.no_grad(): + original_model_logits = original_model(input_ids).logits + hf_model_logits = hf_model(input_ids).logits + if not torch.allclose(original_model_logits, hf_model_logits, atol=1e-3): + raise ValueError("The converted model did not return the same logits as the original model.") + + logger.info("Model conversion validated successfully.") + + +def convert_mamba_checkpoint_file_to_huggingface_model_file( + mamba_checkpoint_path: str, config_json_file: str, output_dir: str +) -> None: + if not is_mamba_ssm_available(): + raise ImportError( + "Calling convert_mamba_checkpoint_file_to_huggingface_model_file requires the mamba_ssm library to be installed. Please install it with `pip install mamba_ssm`." + ) + if not torch.cuda.is_available(): + raise ValueError( + "This script is to be run with a CUDA device, as the original mamba_ssm model does not support cpu." + ) + logger.info(f"Loading model from {mamba_checkpoint_path} based on config from {config_json_file}") + # Load weights and config from paths + original_state_dict = torch.load(mamba_checkpoint_path, map_location="cpu") + with open(config_json_file, "r", encoding="utf-8") as json_file: + original_ssm_config_dict = json.load(json_file) + + # Convert the model + hf_model, tokenizer = convert_mamba_ssm_checkpoint_to_huggingface_model( + original_state_dict, original_ssm_config_dict + ) + + # Validate the conversion + validate_converted_model(original_state_dict, original_ssm_config_dict, hf_model, tokenizer) + + logger.info(f"Model converted successfully. Saving model to {output_dir}") + + # Save new model to pytorch_dump_path + hf_model.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--mamba_checkpoint_file", + type=str, + required=True, + help="Path to a `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", + ) + parser.add_argument( + "-c", + "--config_json_file", + type=str, + required=True, + help="Path to a `config.json` file corresponding to a MambaConfig of the original mamba_ssm model.", + ) + parser.add_argument( + "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." 
+ ) + args = parser.parse_args() + + convert_mamba_checkpoint_file_to_huggingface_model_file( + args.mamba_checkpoint_file, args.config_json_file, args.output_dir + ) From 75b76a5ea461ace0d141d3415879439ae9bbfc22 Mon Sep 17 00:00:00 2001 From: byi8220 Date: Thu, 4 Apr 2024 05:11:09 -0400 Subject: [PATCH 022/110] [`ProcessingIdefics`] Attention mask bug with padding (#29449) * Defaulted IdeficsProcessor padding to 'longest', removed manual padding * make fixup * Defaulted processor call to padding=False * Add padding to processor call in IdeficsModelIntegrationTest as well * Defaulted IdeficsProcessor padding to 'longest', removed manual padding * make fixup * Defaulted processor call to padding=False * Add padding to processor call in IdeficsModelIntegrationTest as well * redefaulted padding=longest again * fixup/doc --- .../models/idefics/processing_idefics.py | 28 +++++-------- tests/models/idefics/test_modeling_idefics.py | 2 +- .../models/idefics/test_processor_idefics.py | 41 ++++++++++++++++++- 3 files changed, 51 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/idefics/processing_idefics.py b/src/transformers/models/idefics/processing_idefics.py index 590e2475ca..d7fd8c8de6 100644 --- a/src/transformers/models/idefics/processing_idefics.py +++ b/src/transformers/models/idefics/processing_idefics.py @@ -149,7 +149,7 @@ class IdeficsProcessor(ProcessorMixin): def __call__( self, prompts: Union[List[TextInput], List[List[TextInput]]], - padding: Union[bool, str, PaddingStrategy] = False, + padding: Union[bool, str, PaddingStrategy] = "longest", truncation: Union[bool, str, TruncationStrategy] = None, max_length: Optional[int] = None, transform: Callable = None, @@ -165,15 +165,17 @@ class IdeficsProcessor(ProcessorMixin): prompts (`Union[List[TextInput], [List[List[TextInput]]]]`): either a single prompt or a batched list of prompts - see the detailed description immediately after the end of the arguments doc section. - padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `"longest"`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: - - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single sequence if provided). - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum acceptable input length for the model if that argument is not provided. - - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different - lengths). + - `False` or `'do_not_pad'`: No padding. This will raise an error if the input sequences are of different + lengths. + Note: Unlike most processors, which set padding=`False` by default, `IdeficsProcessor` sets `padding="longest"` + by default. See https://github.com/huggingface/transformers/pull/29449#pullrequestreview-1925576061 for why. max_length (`int`, *optional*): Maximum length of the returned list and optionally padding length (see above). 
truncation (`bool`, *optional*): @@ -333,8 +335,7 @@ class IdeficsProcessor(ProcessorMixin): max_length=max_length, ) all_texts = text_encoding["input_ids"] - - max_seq_len = max(len(x) for x in all_texts) + all_attention_masks = text_encoding["attention_mask"] # max_num_images has to be at least 1 even when there are no images max_num_images = max(len(x) for x in all_images) @@ -344,14 +345,8 @@ class IdeficsProcessor(ProcessorMixin): output_input_ids = [] output_images = [] output_attention_masks = [] - for text, images in zip(all_texts, all_images): - padded_input_ids = [self.tokenizer.pad_token_id] * max_seq_len - unpadded_seq_len = len(text) - start = max_seq_len - unpadded_seq_len - padded_input_ids[start:] = text[:max_seq_len] - - attention_mask = torch.zeros((max_seq_len,), dtype=torch.long) - attention_mask[start:] = 1 + for text, attention_mask, images in zip(all_texts, all_attention_masks, all_images): + padded_input_ids = text image_count = padded_input_ids.count(self.image_token_id) local_max_num_images = min(image_count, max_num_images) @@ -366,8 +361,7 @@ class IdeficsProcessor(ProcessorMixin): output_images.append(padded_image_tensor) output_input_ids.append(torch.tensor(padded_input_ids)) - - output_attention_masks.append(attention_mask) + output_attention_masks.append(torch.tensor(attention_mask)) output_input_ids = torch.stack(output_input_ids) output_images = torch.stack(output_images) diff --git a/tests/models/idefics/test_modeling_idefics.py b/tests/models/idefics/test_modeling_idefics.py index 3059b5a2f5..9f8f177617 100644 --- a/tests/models/idefics/test_modeling_idefics.py +++ b/tests/models/idefics/test_modeling_idefics.py @@ -656,7 +656,7 @@ class IdeficsModelIntegrationTest(TestCasePlus): "HuggingFaceM4/idefics-9b", quantization_config=quantization_config, device_map="auto" ) processor = self.default_processor - inputs = processor(prompts, return_tensors="pt").to(torch_device) + inputs = processor(prompts, return_tensors="pt", padding="longest").to(torch_device) generated_ids = model.generate(**inputs, max_length=100) generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True) diff --git a/tests/models/idefics/test_processor_idefics.py b/tests/models/idefics/test_processor_idefics.py index e02e645946..2e319413d4 100644 --- a/tests/models/idefics/test_processor_idefics.py +++ b/tests/models/idefics/test_processor_idefics.py @@ -124,7 +124,7 @@ class IdeficsProcessorTest(TestCasePlus): prompts = self.prepare_prompts() # test that all prompts succeeded - input_processor = processor(prompts, return_tensors="pt") + input_processor = processor(prompts, return_tensors="pt", padding="longest") for key in self.input_keys: assert torch.is_tensor(input_processor[key]) @@ -151,14 +151,51 @@ class IdeficsProcessorTest(TestCasePlus): " Describe this image.\nAssistant:", " Describe this image.\nAssistant:", ] + predicted_attention_masks = [ + ([1] * 10) + ([0] * 9), + ([1] * 10) + ([0] * 10), + ] prompts = [[prompt] for prompt in self.prepare_prompts()[2]] max_length = processor(prompts, padding="max_length", truncation=True, max_length=20) longest = processor(prompts, padding="longest", truncation=True, max_length=30) + decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) + self.assertEqual(decoded_max_length, predicted_tokens[1]) self.assertEqual(decoded_longest, predicted_tokens[0]) + self.assertListEqual(max_length["attention_mask"][-1].tolist(), 
predicted_attention_masks[1]) + self.assertListEqual(longest["attention_mask"][-1].tolist(), predicted_attention_masks[0]) + + def test_tokenizer_left_padding(self): + """Identical to test_tokenizer_padding, but with padding_side not explicitly set.""" + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_tokens = [ + " Describe this image.\nAssistant:", + " Describe this image.\nAssistant:", + ] + predicted_attention_masks = [ + ([0] * 9) + ([1] * 10), + ([0] * 10) + ([1] * 10), + ] + prompts = [[prompt] for prompt in self.prepare_prompts()[2]] + max_length = processor(prompts, padding="max_length", truncation=True, max_length=20) + longest = processor(prompts, padding="longest", truncation=True, max_length=30) + + decoded_max_length = processor.tokenizer.decode(max_length["input_ids"][-1]) + decoded_longest = processor.tokenizer.decode(longest["input_ids"][-1]) + + self.assertEqual(decoded_max_length, predicted_tokens[1]) + self.assertEqual(decoded_longest, predicted_tokens[0]) + + self.assertListEqual(max_length["attention_mask"][-1].tolist(), predicted_attention_masks[1]) + self.assertListEqual(longest["attention_mask"][-1].tolist(), predicted_attention_masks[0]) + def test_model_input_names(self): image_processor = self.get_image_processor() tokenizer = self.get_tokenizer() @@ -166,7 +203,7 @@ class IdeficsProcessorTest(TestCasePlus): processor = IdeficsProcessor(tokenizer=tokenizer, image_processor=image_processor) prompts = self.prepare_prompts() - inputs = processor(prompts) + inputs = processor(prompts, padding="longest") # For now the processor supports only ['pixel_values', 'input_ids', 'attention_mask'] self.assertSetEqual(set(inputs.keys()), set(self.input_keys)) From 517a3e670d8fc11374895e870dd0dd041467c7fe Mon Sep 17 00:00:00 2001 From: Saurabh Dash <111897126+saurabhdash2512@users.noreply.github.com> Date: Thu, 4 Apr 2024 16:16:20 +0530 Subject: [PATCH 023/110] Refactor Cohere Model (#30027) * changes * addressing comments * smol fix --- .../models/cohere/configuration_cohere.py | 4 ++ .../models/cohere/modeling_cohere.py | 62 +++++++++++++------ 2 files changed, 46 insertions(+), 20 deletions(-) diff --git a/src/transformers/models/cohere/configuration_cohere.py b/src/transformers/models/cohere/configuration_cohere.py index a310ad5430..7ceca2b887 100644 --- a/src/transformers/models/cohere/configuration_cohere.py +++ b/src/transformers/models/cohere/configuration_cohere.py @@ -85,6 +85,8 @@ class CohereConfig(PretrainedConfig): Whether to use a bias in the query, key, value and output projection layers during self-attention. attention_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio for the attention probabilities. 
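The `use_qk_norm` option documented in the next hunk normalizes the query and key projections per attention head before the rotary embedding is applied. A minimal sketch of the computation, assuming a LayerNorm-style mean/variance reduction over `head_dim` with a per-head scale (this mirrors how `CohereLayerNorm` is reused further down, but is not the modeling code itself):

```python
# Minimal QK-norm sketch: normalize each head's query/key vector over head_dim,
# then rescale with a learnable per-head weight, as the patch does via CohereLayerNorm.
import torch

def qk_norm(x: torch.Tensor, weight: torch.Tensor, eps: float = 1e-5) -> torch.Tensor:
    # x: (batch, seq_len, num_heads, head_dim) -- normalize the last dimension.
    mean = x.mean(-1, keepdim=True)
    variance = (x - mean).pow(2).mean(-1, keepdim=True)
    return weight * (x - mean) * torch.rsqrt(variance + eps)

bsz, q_len, num_heads, head_dim = 2, 5, 4, 8
query_states = torch.randn(bsz, q_len, num_heads, head_dim)
weight = torch.ones(num_heads, head_dim)  # one scale per (head, channel), as in the patch
print(qk_norm(query_states, weight).shape)  # torch.Size([2, 5, 4, 8])
```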
+ use_qk_norm (`bool`, *optional*, defaults to `False`): + Whether to use query-key normalization in the attention ```python >>> from transformers import CohereModel, CohereConfig @@ -123,6 +125,7 @@ class CohereConfig(PretrainedConfig): rope_theta=10000.0, attention_bias=False, attention_dropout=0.0, + use_qk_norm=False, **kwargs, ): self.vocab_size = vocab_size @@ -145,6 +148,7 @@ class CohereConfig(PretrainedConfig): self.rope_theta = rope_theta self.attention_bias = attention_bias self.attention_dropout = attention_dropout + self.use_qk_norm = use_qk_norm super().__init__( pad_token_id=pad_token_id, diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index e949bc1448..41bae6db65 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -76,10 +76,10 @@ def _get_unpad_data(attention_mask): class CohereLayerNorm(nn.Module): - def __init__(self, hidden_size, eps=1e-5, bias=False): + def __init__(self, hidden_size=None, eps=1e-5, bias=False): + """The hidden size can be a tuple or an int. The tuple is used for QKNorm to normalize across head_dim""" super().__init__() self.weight = nn.Parameter(torch.ones(hidden_size)) - self.bias = nn.Parameter(torch.zeros(hidden_size)) if bias else None self.variance_epsilon = eps def forward(self, hidden_states): @@ -89,8 +89,6 @@ class CohereLayerNorm(nn.Module): variance = (hidden_states - mean).pow(2).mean(-1, keepdim=True) hidden_states = (hidden_states - mean) * torch.rsqrt(variance + self.variance_epsilon) hidden_states = self.weight.to(torch.float32) * hidden_states - if self.bias is not None: - hidden_states = hidden_states + self.bias.to(torch.float32) return hidden_states.to(input_dtype) @@ -122,7 +120,7 @@ class CohereRotaryEmbedding(nn.Module): emb = torch.repeat_interleave(freqs, 2, dim=-1) cos = emb.cos() sin = emb.sin() - return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + return cos, sin def rotate_half(x): @@ -133,7 +131,6 @@ def rotate_half(x): return rot_x -# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): """Applies Rotary Position Embedding to the query and key tensors. @@ -154,11 +151,14 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): Returns: `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. 
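The hunk that follows performs the rotation in float32 and casts back afterwards, which avoids precision loss when the model runs in float16/bfloat16. The pattern in isolation, with a simplified interleaved `rotate_half` standing in for the one defined earlier in this file:

```python
# Float32-upcast pattern for rotary embeddings: compute the rotation in fp32,
# return in the caller's dtype. rotate_half here is a simplified stand-in that
# rotates interleaved channel pairs (x1, x2) -> (-x2, x1).
import torch

def rotate_half(x):
    x1 = x[..., ::2]
    x2 = x[..., 1::2]
    return torch.stack((-x2, x1), dim=-1).flatten(-2)

def apply_rope_fp32(q, cos, sin):
    dtype = q.dtype
    q = q.float()  # upcast before the rotation ...
    q_embed = (q * cos.float()) + (rotate_half(q) * sin.float())
    return q_embed.to(dtype)  # ... and cast back afterwards

q = torch.randn(1, 2, 4, 8, dtype=torch.float16)  # (batch, heads, seq, head_dim)
cos, sin = torch.ones(4, 8), torch.zeros(4, 8)    # trivial angles for the demo
print(apply_rope_fp32(q, cos, sin).dtype)         # torch.float16
```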
""" + dtype = q.dtype + q = q.float() + k = k.float() cos = cos.unsqueeze(unsqueeze_dim) sin = sin.unsqueeze(unsqueeze_dim) q_embed = (q * cos) + (rotate_half(q) * sin) k_embed = (k * cos) + (rotate_half(k) * sin) - return q_embed, k_embed + return q_embed.to(dtype=dtype), k_embed.to(dtype=dtype) # Copied from transformers.models.llama.modeling_llama.LlamaMLP Llama->Cohere @@ -192,7 +192,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) -# Copied from transformers.models.llama.modeling_llama.LlamaAttention Llama->Cohere class CohereAttention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" @@ -216,6 +215,7 @@ class CohereAttention(nn.Module): self.max_position_embeddings = config.max_position_embeddings self.rope_theta = config.rope_theta self.is_causal = True + self.use_qk_norm = config.use_qk_norm if (self.head_dim * self.num_heads) != self.hidden_size: raise ValueError( @@ -223,6 +223,13 @@ class CohereAttention(nn.Module): f" and `num_heads`: {self.num_heads})." ) + if self.use_qk_norm: + # When sharding the model using Tensor Parallelism, need to be careful to use n_local_heads + self.q_norm = CohereLayerNorm(hidden_size=(self.num_heads, self.head_dim), eps=config.layer_norm_eps) + self.k_norm = CohereLayerNorm( + hidden_size=(self.num_key_value_heads, self.head_dim), eps=config.layer_norm_eps + ) + self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias) self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) @@ -255,8 +262,14 @@ class CohereAttention(nn.Module): key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + if self.use_qk_norm: + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) past_key_value = getattr(self, "past_key_value", past_key_value) @@ -335,11 +348,14 @@ class CohereFlashAttention2(CohereAttention): key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - # Flash attention requires the input to have the shape - # batch_size x seq_length x head_dim x hidden_dim - # therefore we just need to keep the original shape - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + if self.use_qk_norm: + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) value_states = value_states.view(bsz, q_len, 
self.num_key_value_heads, self.head_dim).transpose(1, 2) cos, sin = self.rotary_emb(value_states, position_ids) @@ -505,7 +521,7 @@ class CohereSdpaAttention(CohereAttention): SDPA API. """ - # Adapted from CohereAttention.forward + # Ignore copy def forward( self, hidden_states: torch.Tensor, @@ -538,8 +554,14 @@ class CohereSdpaAttention(CohereAttention): key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) - query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim) + if self.use_qk_norm: + query_states = self.q_norm(query_states) + key_states = self.k_norm(key_states) + + query_states = query_states.transpose(1, 2) + key_states = key_states.transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) cos, sin = self.rotary_emb(value_states, position_ids) @@ -599,7 +621,7 @@ class CohereDecoderLayer(nn.Module): self.self_attn = COHERE_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx) self.mlp = CohereMLP(config) - self.input_layernorm = CohereLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.input_layernorm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) def forward( self, @@ -822,7 +844,7 @@ class CohereModel(CoherePreTrainedModel): self.layers = nn.ModuleList( [CohereDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] ) - self.norm = CohereLayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.norm = CohereLayerNorm(hidden_size=(config.hidden_size), eps=config.layer_norm_eps) self.gradient_checkpointing = False # Initialize weights and apply final processing From 24d787ce9d362dc0e6151395cfd77337c6c8d475 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:06:40 +0200 Subject: [PATCH 024/110] Add `whisper` to `IMPORTANT_MODELS` (#30046) Add whisper Co-authored-by: ydshieh --- utils/tests_fetcher.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index af4785fb6d..6cc22cc5f1 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -91,6 +91,7 @@ IMPORTANT_MODELS = [ "opt", "longformer", "vit", + "whisper", # Pipeline-specific model (to be sure each pipeline has one model in this list) "tapas", "vilt", From 8b52fa6b4209c79e623fc3cc2c4756758c920c3c Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Apr 2024 09:07:41 +0200 Subject: [PATCH 025/110] skip `test_encode_decode_fast_slow_all_tokens` for now (#30044) skip test_encode_decode_fast_slow_all_tokens for now Co-authored-by: ydshieh --- tests/test_tokenization_common.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 4ff17ab557..e98f09d431 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1580,6 +1580,10 @@ class TokenizerTesterMixin: self.assertEqual(len(overflowing_tokens), 2 + stride) self.assertEqual(overflowing_tokens, seq1_tokens[-(2 + stride) :]) + # TODO: FIXME @ArthurZucker + @unittest.skip( + reason="start to fail after # 29473. 
See https://github.com/huggingface/transformers/pull/29473#pullrequestreview-1945687810" + ) @slow @require_read_token def test_encode_decode_fast_slow_all_tokens(self): From 79d62b2da227b39619afa7f3a86d8aeb95e0f4fa Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Fri, 5 Apr 2024 15:26:44 +0800 Subject: [PATCH 026/110] =?UTF-8?q?if=20output=20is=20tuple=20like=20faceb?= =?UTF-8?q?ook/hf-seamless-m4t-medium,=20waveform=20is=20=E2=80=A6=20(#297?= =?UTF-8?q?22)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * if output is tuple like facebook/hf-seamless-m4t-medium, waveform is the first element Signed-off-by: Wang, Yi * add test and fix batch issue Signed-off-by: Wang, Yi * add dict output support for seamless_m4t Signed-off-by: Wang, Yi --------- Signed-off-by: Wang, Yi --- .../seamless_m4t/modeling_seamless_m4t.py | 1 - src/transformers/pipelines/pt_utils.py | 5 ++++- src/transformers/pipelines/text_to_audio.py | 5 ++++- .../pipelines/test_pipelines_text_to_audio.py | 21 +++++++++++++++++++ 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py index f619dd9e79..c0fe60a643 100755 --- a/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py +++ b/src/transformers/models/seamless_m4t/modeling_seamless_m4t.py @@ -3496,7 +3496,6 @@ class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel): self.device ) kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids - # second generation unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech) output_unit_ids = unit_ids.detach().clone() diff --git a/src/transformers/pipelines/pt_utils.py b/src/transformers/pipelines/pt_utils.py index c39f906f64..652d1eb544 100644 --- a/src/transformers/pipelines/pt_utils.py +++ b/src/transformers/pipelines/pt_utils.py @@ -128,9 +128,12 @@ class PipelineIterator(IterableDataset): # Try to infer the size of the batch if isinstance(processed, torch.Tensor): first_tensor = processed + elif isinstance(processed, tuple): + first_tensor = processed[0] else: key = list(processed.keys())[0] first_tensor = processed[key] + if isinstance(first_tensor, list): observed_batch_size = len(first_tensor) else: @@ -140,7 +143,7 @@ class PipelineIterator(IterableDataset): # elements. 
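The surrounding `PipelineIterator` hunk generalizes batch-size inference: model outputs may be a plain tensor, a tuple (as returned by `facebook/hf-seamless-m4t-medium`), or a dict, and in each case the batch size is read off the first tensor. The dispatch in isolation, as a runnable sketch:

```python
# Sketch of the first-tensor lookup used to infer the observed batch size.
import torch

def first_tensor(processed):
    if isinstance(processed, torch.Tensor):
        return processed
    if isinstance(processed, tuple):
        return processed[0]                   # tuple outputs: the waveform comes first
    return processed[next(iter(processed))]   # dict-like: tensor under the first key

for outputs in (torch.zeros(3, 7), (torch.zeros(3, 7), "extra"), {"waveform": torch.zeros(3, 7)}):
    print(first_tensor(outputs).shape[0])  # 3 in every case
```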
self.loader_batch_size = observed_batch_size # Setting internal index to unwrap the batch - self._loader_batch_data = processed + self._loader_batch_data = processed[0] if isinstance(processed, tuple) else processed self._loader_batch_index = 0 return self.loader_batch_item() else: diff --git a/src/transformers/pipelines/text_to_audio.py b/src/transformers/pipelines/text_to_audio.py index 58c21cc121..81653f14d6 100644 --- a/src/transformers/pipelines/text_to_audio.py +++ b/src/transformers/pipelines/text_to_audio.py @@ -200,7 +200,10 @@ class TextToAudioPipeline(Pipeline): def postprocess(self, waveform): output_dict = {} - + if isinstance(waveform, dict): + waveform = waveform["waveform"] + elif isinstance(waveform, tuple): + waveform = waveform[0] output_dict["audio"] = waveform.cpu().float().numpy() output_dict["sampling_rate"] = self.sampling_rate diff --git a/tests/pipelines/test_pipelines_text_to_audio.py b/tests/pipelines/test_pipelines_text_to_audio.py index a9f1eccae5..b780d26d79 100644 --- a/tests/pipelines/test_pipelines_text_to_audio.py +++ b/tests/pipelines/test_pipelines_text_to_audio.py @@ -66,6 +66,27 @@ class TextToAudioPipelineTests(unittest.TestCase): audio = [output["audio"] for output in outputs] self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) + @slow + @require_torch + def test_medium_seamless_m4t_pt(self): + speech_generator = pipeline(task="text-to-audio", model="facebook/hf-seamless-m4t-medium", framework="pt") + + for forward_params in [{"tgt_lang": "eng"}, {"return_intermediate_token_ids": True, "tgt_lang": "eng"}]: + outputs = speech_generator("This is a test", forward_params=forward_params) + self.assertEqual({"audio": ANY(np.ndarray), "sampling_rate": 16000}, outputs) + + # test two examples side-by-side + outputs = speech_generator(["This is a test", "This is a second test"], forward_params=forward_params) + audio = [output["audio"] for output in outputs] + self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) + + # test batching + outputs = speech_generator( + ["This is a test", "This is a second test"], forward_params=forward_params, batch_size=2 + ) + audio = [output["audio"] for output in outputs] + self.assertEqual([ANY(np.ndarray), ANY(np.ndarray)], audio) + @slow @require_torch def test_small_bark_pt(self): From d704c0b698659ea5f22b6b6efb614b8580b726b2 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Fri, 5 Apr 2024 00:49:42 -0700 Subject: [PATCH 027/110] Fix mixtral ONNX Exporter Issue. (#29858) * fix mixtral onnx export * fix qwen model --- src/transformers/models/mixtral/modeling_mixtral.py | 8 ++------ src/transformers/models/qwen2_moe/modeling_qwen2_moe.py | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/src/transformers/models/mixtral/modeling_mixtral.py b/src/transformers/models/mixtral/modeling_mixtral.py index e9e801bb71..baa33421d9 100644 --- a/src/transformers/models/mixtral/modeling_mixtral.py +++ b/src/transformers/models/mixtral/modeling_mixtral.py @@ -871,15 +871,11 @@ class MixtralSparseMoeBlock(nn.Module): if top_x.shape[0] == 0: continue - # in torch it is faster to index using lists than torch tensors - top_x_list = top_x.tolist() - idx_list = idx.tolist() - # Index the correct hidden states and compute the expert hidden state for # the current expert. 
We need to make sure to multiply the output hidden # states by `routing_weights` on the corresponding tokens (top-1 and top-2) - current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim) - current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None] + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] # However `index_add_` only support torch tensors for indexing so we'll use # the `top_x` tensor here. diff --git a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py index e921af9232..cab2ef5ff7 100644 --- a/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py +++ b/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py @@ -843,15 +843,11 @@ class Qwen2MoeSparseMoeBlock(nn.Module): if top_x.shape[0] == 0: continue - # in torch it is faster to index using lists than torch tensors - top_x_list = top_x.tolist() - idx_list = idx.tolist() - # Index the correct hidden states and compute the expert hidden state for # the current expert. We need to make sure to multiply the output hidden # states by `routing_weights` on the corresponding tokens (top-1 and top-2) - current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim) - current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None] + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] # However `index_add_` only support torch tensors for indexing so we'll use # the `top_x` tensor here. From 1ab71364886010c31b20dd8c8bb0c60f8a0681ad Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Fri, 5 Apr 2024 10:10:44 +0200 Subject: [PATCH 028/110] [Trainer] Allow passing image processor (#29896) * Add image processor to trainer * Replace tokenizer=image_processor everywhere --- docs/source/en/tasks/image_classification.md | 4 ++-- docs/source/en/tasks/object_detection.md | 2 +- docs/source/en/tasks/semantic_segmentation.md | 2 +- docs/source/en/tasks/video_classification.md | 2 +- docs/source/es/tasks/image_classification.md | 2 +- docs/source/ja/tasks/image_classification.md | 4 ++-- docs/source/ja/tasks/object_detection.md | 2 +- docs/source/ja/tasks/semantic_segmentation.md | 2 +- .../ja/tasks/sequence_classification.md | 2 +- docs/source/ja/tasks/video_classification.md | 2 +- docs/source/ko/tasks/image_classification.md | 4 ++-- docs/source/ko/tasks/object_detection.md | 2 +- docs/source/ko/tasks/semantic_segmentation.md | 2 +- docs/source/ko/tasks/video_classification.md | 2 +- .../run_image_classification.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- .../run_semantic_segmentation.py | 2 +- .../run_image_classification.py | 2 +- src/transformers/trainer.py | 19 ++++++++++++++++--- src/transformers/trainer_callback.py | 6 +++++- 21 files changed, 43 insertions(+), 26 deletions(-) diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index 30c517f3be..f54b4ed025 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -322,7 +322,7 @@ At this point, only three steps remain: ... data_collator=data_collator, ... train_dataset=food["train"], ... 
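Returning to the Mixtral/Qwen2-MoE hunks above: the fix replaces the `.tolist()` round-trip with direct tensor indexing, which traces cleanly under the ONNX exporter. The indexing pattern on toy shapes:

```python
# Export-friendly expert gathering: index hidden states with torch tensors
# (top_x, idx) directly instead of Python lists. Shapes are toy values.
import torch

hidden_dim = 4
hidden_states = torch.randn(6, hidden_dim)   # (tokens, hidden_dim)
routing_weights = torch.rand(6, 2)           # (tokens, top_k)
top_x = torch.tensor([0, 2, 5])              # token positions routed to this expert
idx = torch.tensor([0, 1, 0])                # which top-k slot selected the expert
expert_layer = torch.nn.Linear(hidden_dim, hidden_dim)

current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
print(current_hidden_states.shape)  # torch.Size([3, 4])
```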
eval_dataset=food["test"], -... tokenizer=image_processor, +... image_processor=image_processor, ... compute_metrics=compute_metrics, ... ) @@ -418,7 +418,7 @@ and use the [PushToHubCallback](../main_classes/keras_callbacks#transformers.Pus >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) >>> push_to_hub_callback = PushToHubCallback( ... output_dir="food_classifier", -... tokenizer=image_processor, +... image_processor=image_processor, ... save_strategy="no", ... ) >>> callbacks = [metric_callback, push_to_hub_callback] diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 2513591f54..56d46e4aa5 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -384,7 +384,7 @@ Finally, bring everything together, and call [`~transformers.Trainer.train`]: ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... tokenizer=image_processor, +... image_processor=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md index e99499bbbb..ba40ccba1e 100644 --- a/docs/source/en/tasks/semantic_segmentation.md +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -642,7 +642,7 @@ and use the [`PushToHubCallback`] to upload the model: ... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] ... ) ->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor) >>> callbacks = [metric_callback, push_to_hub_callback] ``` diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md index 38bdceba41..a0f0a695f7 100644 --- a/docs/source/en/tasks/video_classification.md +++ b/docs/source/en/tasks/video_classification.md @@ -407,7 +407,7 @@ Then you just pass all of this along with the datasets to `Trainer`: ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... tokenizer=image_processor, +... image_processor=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md index f09730caf6..4a572d8169 100644 --- a/docs/source/es/tasks/image_classification.md +++ b/docs/source/es/tasks/image_classification.md @@ -160,7 +160,7 @@ Al llegar a este punto, solo quedan tres pasos: ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... tokenizer=image_processor, +... image_processor=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md index f8d8d0d552..fc57cf4dfb 100644 --- a/docs/source/ja/tasks/image_classification.md +++ b/docs/source/ja/tasks/image_classification.md @@ -328,7 +328,7 @@ food["test"].set_transform(preprocess_val) ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... tokenizer=image_processor, +... image_processor=image_processor, ... compute_metrics=compute_metrics, ... 
) @@ -426,7 +426,7 @@ Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Data >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) >>> push_to_hub_callback = PushToHubCallback( ... output_dir="food_classifier", -... tokenizer=image_processor, +... image_processor=image_processor, ... save_strategy="no", ... ) >>> callbacks = [metric_callback, push_to_hub_callback] diff --git a/docs/source/ja/tasks/object_detection.md b/docs/source/ja/tasks/object_detection.md index 389e7bdf2f..e90cb4645a 100644 --- a/docs/source/ja/tasks/object_detection.md +++ b/docs/source/ja/tasks/object_detection.md @@ -376,7 +376,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... tokenizer=image_processor, +... image_processor=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/ja/tasks/semantic_segmentation.md b/docs/source/ja/tasks/semantic_segmentation.md index 2816688b4e..bc4c8fdc10 100644 --- a/docs/source/ja/tasks/semantic_segmentation.md +++ b/docs/source/ja/tasks/semantic_segmentation.md @@ -434,7 +434,7 @@ TensorFlow でモデルを微調整するには、次の手順に従います。 ... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] ... ) ->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor) >>> callbacks = [metric_callback, push_to_hub_callback] ``` diff --git a/docs/source/ja/tasks/sequence_classification.md b/docs/source/ja/tasks/sequence_classification.md index 6673cfe9e5..767d5e03cd 100644 --- a/docs/source/ja/tasks/sequence_classification.md +++ b/docs/source/ja/tasks/sequence_classification.md @@ -436,7 +436,7 @@ TensorFlow でモデルを微調整するには、次の手順に従います。 ... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] ... ) ->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor) >>> callbacks = [metric_callback, push_to_hub_callback] ``` diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md index e0c3836194..b0b5139028 100644 --- a/docs/source/ja/tasks/video_classification.md +++ b/docs/source/ja/tasks/video_classification.md @@ -414,7 +414,7 @@ def compute_metrics(eval_pred): ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... tokenizer=image_processor, +... image_processor=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) diff --git a/docs/source/ko/tasks/image_classification.md b/docs/source/ko/tasks/image_classification.md index 031e01ea5c..055100d4c0 100644 --- a/docs/source/ko/tasks/image_classification.md +++ b/docs/source/ko/tasks/image_classification.md @@ -321,7 +321,7 @@ food["test"].set_transform(preprocess_val) ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... tokenizer=image_processor, +... image_processor=image_processor, ... compute_metrics=compute_metrics, ... ) @@ -417,7 +417,7 @@ TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요: >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) >>> push_to_hub_callback = PushToHubCallback( ... 
output_dir="food_classifier", -... tokenizer=image_processor, +... image_processor=image_processor, ... save_strategy="no", ... ) >>> callbacks = [metric_callback, push_to_hub_callback] diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md index 0076bba6f8..1eeada9a50 100644 --- a/docs/source/ko/tasks/object_detection.md +++ b/docs/source/ko/tasks/object_detection.md @@ -366,7 +366,7 @@ DatasetDict({ ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... tokenizer=image_processor, +... image_processor=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/ko/tasks/semantic_segmentation.md b/docs/source/ko/tasks/semantic_segmentation.md index 4b6109d692..4c23b2ad80 100644 --- a/docs/source/ko/tasks/semantic_segmentation.md +++ b/docs/source/ko/tasks/semantic_segmentation.md @@ -424,7 +424,7 @@ TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요: ... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] ... ) ->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor) >>> callbacks = [metric_callback, push_to_hub_callback] ``` diff --git a/docs/source/ko/tasks/video_classification.md b/docs/source/ko/tasks/video_classification.md index 01dbb0757b..4d13f9ac61 100644 --- a/docs/source/ko/tasks/video_classification.md +++ b/docs/source/ko/tasks/video_classification.md @@ -411,7 +411,7 @@ def compute_metrics(eval_pred): ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... tokenizer=image_processor, +... image_processor=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... 
) diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index ff01600cb3..1c952e5601 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -411,7 +411,7 @@ def main(): train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=image_processor, + image_processor=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index a23e41df61..0f098caf02 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -369,7 +369,7 @@ def main(): args=training_args, train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, - tokenizer=image_processor, + image_processor=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index 625a96f14e..e1afeece12 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -458,7 +458,7 @@ def main(): args=training_args, train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, - tokenizer=image_processor, + image_processor=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 957b78b9b5..8324531ccb 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -510,7 +510,7 @@ def main(): train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, - tokenizer=image_processor, + image_processor=image_processor, data_collator=default_data_collator, ) diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index 3e2b43bca1..ab2de73a3b 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -552,7 +552,7 @@ def main(): output_dir=training_args.output_dir, hub_model_id=push_to_hub_model_id, hub_token=training_args.push_to_hub_token, - tokenizer=image_processor, + image_processor=image_processor, **model_card_kwargs, ) ) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 6bcf4796f8..436165b0e3 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -59,6 +59,7 @@ from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator from .debug_utils import DebugOption, DebugUnderflowOverflow from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend +from .image_processing_utils import BaseImageProcessor from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available from 
.integrations.tpu import tpu_spmd_dataloader from .modelcard import TrainingSummary @@ -303,6 +304,9 @@ class Trainer: The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an interrupted training or reuse the fine-tuned model. + image_processor ([`BaseImageProcessor`], *optional*): + The image processor used to preprocess the data. If provided, it will be saved along the model to make it easier + to rerun an interrupted training or reuse the fine-tuned model. model_init (`Callable[[], PreTrainedModel]`, *optional*): A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start from a new instance of the model as given by this function. @@ -357,6 +361,7 @@ class Trainer: train_dataset: Optional[Union[Dataset, IterableDataset]] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, + image_processor: Optional["BaseImageProcessor"] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, @@ -485,11 +490,12 @@ class Trainer: ): self.place_model_on_device = False - default_collator = default_data_collator if tokenizer is None else DataCollatorWithPadding(tokenizer) + default_collator = DataCollatorWithPadding(tokenizer) if tokenizer is not None else default_data_collator self.data_collator = data_collator if data_collator is not None else default_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.tokenizer = tokenizer + self.image_processor = image_processor # Bnb Quantized models doesn't support `.to` operation. if ( @@ -541,7 +547,7 @@ class Trainer: default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks self.callback_handler = CallbackHandler( - callbacks, self.model, self.tokenizer, self.optimizer, self.lr_scheduler + callbacks, self.model, self.tokenizer, self.image_processor, self.optimizer, self.lr_scheduler ) self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK) @@ -3276,6 +3282,8 @@ class Trainer: ) if self.tokenizer is not None and self.args.should_save: self.tokenizer.save_pretrained(output_dir) + if self.image_processor is not None and self.args.should_save: + self.image_processor.save_pretrained(output_dir) # We moved the model from TPU -> CPU for saving the weights. # Now we should move it back to subsequent compute still works. @@ -3313,6 +3321,8 @@ class Trainer: if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) + if self.image_processor is not None: + self.image_processor.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) @@ -4009,6 +4019,9 @@ class Trainer: # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure. 
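The save-side hunks here and below follow one pattern: every processing class the `Trainer` was given is written next to the model with `save_pretrained`, so a resumed or re-loaded run finds everything in one directory. A toy version of that pattern (the `FakeProcessor` class and file names are illustrative only):

```python
# Save every available processing object alongside the model directory.
import os
import tempfile

class FakeProcessor:
    def __init__(self, name):
        self.name = name
    def save_pretrained(self, output_dir):
        # Real processors write their config/vocab files; a marker file suffices here.
        open(os.path.join(output_dir, f"{self.name}.json"), "w").close()

def save_processing_classes(output_dir, tokenizer=None, image_processor=None):
    os.makedirs(output_dir, exist_ok=True)
    for obj in (tokenizer, image_processor):
        if obj is not None:
            obj.save_pretrained(output_dir)

with tempfile.TemporaryDirectory() as d:
    save_processing_classes(d, FakeProcessor("tokenizer"), FakeProcessor("image_processor"))
    print(sorted(os.listdir(d)))  # ['image_processor.json', 'tokenizer.json']
```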
if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) + # Same for the image processor + if self.image_processor is not None: + self.image_processor.save_pretrained(output_dir) # Same for the training arguments torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) @@ -4056,7 +4069,7 @@ class Trainer: def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str: """ - Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`. + Upload `self.model` and `self.tokenizer` or `self.image_processor` to the 🤗 model hub on the repo `self.args.hub_model_id`. Parameters: commit_message (`str`, *optional*, defaults to `"End of training"`): diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 1e3b0e587a..a9cb6eca59 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -189,6 +189,8 @@ class TrainerCallback: The model being trained. tokenizer ([`PreTrainedTokenizer`]): The tokenizer used for encoding the data. + image_processor ([`BaseImageProcessor`]): + The image processor used for encoding the images. optimizer (`torch.optim.Optimizer`): The optimizer used for the training steps. lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`): @@ -307,12 +309,13 @@ class TrainerCallback: class CallbackHandler(TrainerCallback): """Internal class that just calls the list of callbacks in order.""" - def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler): + def __init__(self, callbacks, model, tokenizer, image_processor, optimizer, lr_scheduler): self.callbacks = [] for cb in callbacks: self.add_callback(cb) self.model = model self.tokenizer = tokenizer + self.image_processor = image_processor self.optimizer = optimizer self.lr_scheduler = lr_scheduler self.train_dataloader = None @@ -417,6 +420,7 @@ class CallbackHandler(TrainerCallback): control, model=self.model, tokenizer=self.tokenizer, + image_processor=self.image_processor, optimizer=self.optimizer, lr_scheduler=self.lr_scheduler, train_dataloader=self.train_dataloader, From 4207a4076d1bef446402edeba7297a58d4389157 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Fri, 5 Apr 2024 13:11:28 +0200 Subject: [PATCH 029/110] [bnb] Fix offload test (#30039) fix bnb test --- tests/quantization/bnb/test_mixed_int8.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/quantization/bnb/test_mixed_int8.py b/tests/quantization/bnb/test_mixed_int8.py index cf6e156034..227273d278 100644 --- a/tests/quantization/bnb/test_mixed_int8.py +++ b/tests/quantization/bnb/test_mixed_int8.py @@ -100,6 +100,8 @@ class BaseMixedInt8Test(unittest.TestCase): # Expected values on a A10 EXPECTED_OUTPUTS.add("Hello my name is John.\nI am a friend of your father.\n") MAX_NEW_TOKENS = 10 + # Expected values with offload + EXPECTED_OUTPUTS.add("Hello my name is John and I am a professional photographer based in") def setUp(self): # Models and tokenizer From de11d0bdf0286f64616ea0d4b5778c41151a2d22 Mon Sep 17 00:00:00 2001 From: miRx923 <94078303+miRx923@users.noreply.github.com> Date: Fri, 5 Apr 2024 14:04:50 +0200 Subject: [PATCH 030/110] Update quantizer_bnb_4bit.py: In the ValueError string there should be "....you need to set `llm_int8_enable_fp32_cpu_offload=True`...." instead of "`load_in_8bit_fp32_cpu_offload=True`". 
(#30013) * Update quantizer_bnb_4bit.py There is an mistake in ValueError on line 86 of quantizer_bnb_4bit.py. In the error string there should be "....you need to set `llm_int8_enable_fp32_cpu_offload=True`...." instead of "load_in_8bit_fp32_cpu_offload=True". I think you updated the BitsAndBytesConfig() arguments, but forgot to change the ValueError in quantizer_bnb_4bit.py. * Update quantizer_bnb_4bit.py Changed ValueError string "...you need to set load_in_8bit_fp32_cpu_offload=True..." to "....you need to set llm_int8_enable_fp32_cpu_offload=True...." --- src/transformers/quantizers/quantizer_bnb_4bit.py | 2 +- src/transformers/quantizers/quantizer_bnb_8bit.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/quantizers/quantizer_bnb_4bit.py b/src/transformers/quantizers/quantizer_bnb_4bit.py index b98eebba18..112cfd644f 100644 --- a/src/transformers/quantizers/quantizer_bnb_4bit.py +++ b/src/transformers/quantizers/quantizer_bnb_4bit.py @@ -87,7 +87,7 @@ class Bnb4BitHfQuantizer(HfQuantizer): """ Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules - in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to + in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. diff --git a/src/transformers/quantizers/quantizer_bnb_8bit.py b/src/transformers/quantizers/quantizer_bnb_8bit.py index f4249b69d0..8ad60a03e2 100644 --- a/src/transformers/quantizers/quantizer_bnb_8bit.py +++ b/src/transformers/quantizers/quantizer_bnb_8bit.py @@ -87,7 +87,7 @@ class Bnb8BitHfQuantizer(HfQuantizer): """ Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules - in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom `device_map` to + in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. From 48795317a21e9128d3ca877657acd855e9ba8477 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Apr 2024 14:30:36 +0200 Subject: [PATCH 031/110] [test fetcher] Always include the directly related test files (#30050) * fix * fix --------- Co-authored-by: ydshieh --- utils/tests_fetcher.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py index 6cc22cc5f1..e54e6d0de4 100644 --- a/utils/tests_fetcher.py +++ b/utils/tests_fetcher.py @@ -958,10 +958,25 @@ def create_module_to_test_map( model_tests = {Path(t).parts[2] for t in tests if t.startswith("tests/models/")} return len(model_tests) > num_model_tests // 2 - def filter_tests(tests): - return [t for t in tests if not t.startswith("tests/models/") or Path(t).parts[2] in IMPORTANT_MODELS] + # for each module (if specified in the argument `module`) of the form `models/my_model` (i.e. 
starting with it), + # we always keep the tests (those are already in the argument `tests`) which are in `tests/models/my_model`. + # This is to avoid them being excluded when a module has many impacted tests: the directly related test files should + # always be included! + def filter_tests(tests, module=""): + return [ + t + for t in tests + if not t.startswith("tests/models/") + or Path(t).parts[2] in IMPORTANT_MODELS + # at this point, `t` is of the form `tests/models/my_model`, and we check if `models/my_model` + # (i.e. `parts[1:3]`) is in `module`. + or "/".join(Path(t).parts[1:3]) in module + ] - return {module: (filter_tests(tests) if has_many_models(tests) else tests) for module, tests in test_map.items()} + return { + module: (filter_tests(tests, module=module) if has_many_models(tests) else tests) + for module, tests in test_map.items() + } def check_imports_all_exist(): From 17cd7a9d28e12ed3f1623d1193f0b3a2ad4aca92 Mon Sep 17 00:00:00 2001 From: Michael Benayoun Date: Fri, 5 Apr 2024 15:14:09 +0200 Subject: [PATCH 032/110] Fix `torch.fx` symbolic tracing for LLama (#30047) * [WIP] fix fx * [WIP] fix fx * [WIP] fix fx * [WIP] fix fx * [WIP] fix fx * Apply changes to other models --- src/transformers/models/cohere/modeling_cohere.py | 8 +++++--- src/transformers/models/gemma/modeling_gemma.py | 8 +++++--- src/transformers/models/llama/modeling_llama.py | 8 +++++--- src/transformers/utils/fx.py | 9 ++++++--- tests/models/cohere/test_modeling_cohere.py | 4 +--- tests/models/llama/test_modeling_llama.py | 4 +--- 6 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/transformers/models/cohere/modeling_cohere.py b/src/transformers/models/cohere/modeling_cohere.py index 41bae6db65..95a7d76827 100644 --- a/src/transformers/models/cohere/modeling_cohere.py +++ b/src/transformers/models/cohere/modeling_cohere.py @@ -908,7 +908,9 @@ class CohereModel(CoherePreTrainedModel): if position_ids is None: position_ids = cache_position.unsqueeze(0) - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_seen_tokens + inputs_embeds.shape[1] + ) # embed positions hidden_states = inputs_embeds @@ -976,7 +978,7 @@ class CohereModel(CoherePreTrainedModel): # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using # `fullgraph=True`. 
See more context in https://github.com/huggingface/transformers/pull/29114 - def _update_causal_mask(self, attention_mask, input_tensor, cache_position): + def _update_causal_mask(self, attention_mask, input_tensor, cache_position, current_length): if self.config._attn_implementation == "flash_attention_2": if attention_mask is not None and 0.0 in attention_mask: return attention_mask @@ -989,7 +991,7 @@ class CohereModel(CoherePreTrainedModel): target_length = self.config.max_position_embeddings else: # dynamic cache target_length = ( - attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + 1 ) causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) diff --git a/src/transformers/models/gemma/modeling_gemma.py b/src/transformers/models/gemma/modeling_gemma.py index 2d93c43425..c8b9b11c55 100644 --- a/src/transformers/models/gemma/modeling_gemma.py +++ b/src/transformers/models/gemma/modeling_gemma.py @@ -888,7 +888,9 @@ class GemmaModel(GemmaPreTrainedModel): if position_ids is None: position_ids = cache_position.unsqueeze(0) - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_seen_tokens + inputs_embeds.shape[1] + ) # embed positions hidden_states = inputs_embeds @@ -962,7 +964,7 @@ class GemmaModel(GemmaPreTrainedModel): # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 - def _update_causal_mask(self, attention_mask, input_tensor, cache_position): + def _update_causal_mask(self, attention_mask, input_tensor, cache_position, current_length): if self.config._attn_implementation == "flash_attention_2": if attention_mask is not None and 0.0 in attention_mask: return attention_mask @@ -975,7 +977,7 @@ class GemmaModel(GemmaPreTrainedModel): target_length = self.config.max_position_embeddings else: # dynamic cache target_length = ( - attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + 1 ) causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) diff --git a/src/transformers/models/llama/modeling_llama.py b/src/transformers/models/llama/modeling_llama.py index 8d0baf63c7..e1afb61be0 100644 --- a/src/transformers/models/llama/modeling_llama.py +++ b/src/transformers/models/llama/modeling_llama.py @@ -987,7 +987,9 @@ class LlamaModel(LlamaPreTrainedModel): if position_ids is None: position_ids = cache_position.unsqueeze(0) - causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_seen_tokens + inputs_embeds.shape[1] + ) # embed positions hidden_states = inputs_embeds @@ -1055,7 +1057,7 @@ class LlamaModel(LlamaPreTrainedModel): # KV cache is used. 
This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 - def _update_causal_mask(self, attention_mask, input_tensor, cache_position): + def _update_causal_mask(self, attention_mask, input_tensor, cache_position, current_length): if self.config._attn_implementation == "flash_attention_2": if attention_mask is not None and 0.0 in attention_mask: return attention_mask @@ -1068,7 +1070,7 @@ class LlamaModel(LlamaPreTrainedModel): target_length = self.config.max_position_embeddings else: # dynamic cache target_length = ( - attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else cache_position[-1] + 1 + attention_mask.shape[-1] if isinstance(attention_mask, torch.Tensor) else current_length + 1 ) causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) diff --git a/src/transformers/utils/fx.py b/src/transformers/utils/fx.py index fd2b1512b2..df0aba8d5d 100755 --- a/src/transformers/utils/fx.py +++ b/src/transformers/utils/fx.py @@ -260,11 +260,14 @@ def torch_arange(*args, **kwargs): def torch_full(*args, **kwargs): args = list(args) - if isinstance(args[1], torch.Tensor) and args[1].device == torch.device("meta"): - args[1] = 1 # Any value. + # We set the fill value to 1 as its value is not important as long as it's not a tensor on the `meta` device. + if len(args) > 1: + args[1] = 1 + else: + kwargs["fill_value"] = 1 kwargs_without_device = dict(kwargs) kwargs_without_device.pop("device", None) - return torch.full(*args, **kwargs_without_device) + return torch.full(*args, **kwargs_without_device, device="meta") def torch_cat(tensors, dim=None, axis=None, *, out=None): diff --git a/tests/models/cohere/test_modeling_cohere.py b/tests/models/cohere/test_modeling_cohere.py index 883eb92e8b..3e86ffe9d9 100644 --- a/tests/models/cohere/test_modeling_cohere.py +++ b/tests/models/cohere/test_modeling_cohere.py @@ -283,9 +283,7 @@ class CohereModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMix ) test_headmasking = False test_pruning = False - fx_compatible = ( - False # FIXME @michaelbenayoun or @fxmarty from https://github.com/huggingface/transformers/pull/29753 - ) + fx_compatible = True # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index e0a3990bd8..0fb4087dba 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -305,9 +305,7 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi ) test_headmasking = False test_pruning = False - fx_compatible = ( - False # FIXME @michaelbenayoun or @fxmarty from https://github.com/huggingface/transformers/pull/29753 - ) + fx_compatible = True # Need to use `0.8` instead of `0.9` for `test_cpu_offload` # This is because we are hitting edge cases with the causal_mask buffer From b17b54d3dd3093815d375c2a78d5b3a0661c3c8d Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Apr 2024 15:49:51 +0200 Subject: [PATCH 033/110] Refactor daily CI workflow (#30012) * separate jobs * separate jobs 
* use channel name directly instead of ID * use channel name directly instead of ID * use channel name directly instead of ID --------- Co-authored-by: ydshieh --- .github/workflows/self-scheduled-caller.yml | 59 ++++++ .github/workflows/self-scheduled.yml | 193 ++++++++------------ .github/workflows/slack-report.yml | 64 +++++++ utils/notification_service.py | 88 ++++++--- 4 files changed, 267 insertions(+), 137 deletions(-) create mode 100644 .github/workflows/self-scheduled-caller.yml create mode 100644 .github/workflows/slack-report.yml diff --git a/.github/workflows/self-scheduled-caller.yml b/.github/workflows/self-scheduled-caller.yml new file mode 100644 index 0000000000..59b992bcd2 --- /dev/null +++ b/.github/workflows/self-scheduled-caller.yml @@ -0,0 +1,59 @@ +name: Self-hosted runner (scheduled) + + +on: + repository_dispatch: + schedule: + - cron: "17 2 * * *" + push: + branches: + - run_scheduled_ci* + +jobs: + model-ci: + name: Model CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_tests_gpu + slack_report_channel: "#transformers-ci-daily-models" + secrets: inherit + + torch-pipeline: + name: Torch pipeline CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_pipelines_torch_gpu + slack_report_channel: "#transformers-ci-daily-pipeline-torch" + secrets: inherit + + tf-pipeline: + name: TF pipeline CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_pipelines_tf_gpu + slack_report_channel: "#transformers-ci-daily-pipeline-tf" + secrets: inherit + + example-ci: + name: Example CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_examples_gpu + slack_report_channel: "#transformers-ci-daily-examples" + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_all_tests_torch_cuda_extensions_gpu + slack_report_channel: "#transformers-ci-daily-deepspeed" + secrets: inherit + + quantization-ci: + name: Quantization CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_tests_quantization_torch_gpu + slack_report_channel: "#transformers-ci-daily-quantization" + secrets: inherit diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 465c00dd13..3e563e94e1 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -7,12 +7,14 @@ name: Self-hosted runner (scheduled) # `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` on: - repository_dispatch: - schedule: - - cron: "17 2 * * *" - push: - branches: - - run_scheduled_ci* + workflow_call: + inputs: + job: + required: true + type: string + slack_report_channel: + required: true + type: string env: HF_HOME: /mnt/cache @@ -31,6 +33,7 @@ env: jobs: setup: + if: ${{ inputs.job == 'run_tests_gpu' }} name: Setup strategy: matrix: @@ -71,6 +74,7 @@ jobs: nvidia-smi run_tests_gpu: + if: ${{ inputs.job == 'run_tests_gpu' }} name: " " needs: setup strategy: @@ -85,58 +89,8 @@ jobs: slice_id: ${{ matrix.slice_id }} secrets: inherit - run_examples_gpu: - name: Examples directory - strategy: - fail-fast: false - matrix: - machine_type: [single-gpu] - runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - container: - image: huggingface/transformers-all-latest-gpu - options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup - steps: - - name: Update clone - working-directory: /transformers - run: git fetch && git checkout ${{ github.sha }} - - - name: Reinstall 
transformers in edit mode (remove the one installed during docker image build) - working-directory: /transformers - run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - - - name: NVIDIA-SMI - run: | - nvidia-smi - - - name: Environment - working-directory: /transformers - run: | - python3 utils/print_env.py - - - name: Show installed libraries and their versions - working-directory: /transformers - run: pip freeze - - - name: Run examples tests on GPU - working-directory: /transformers - run: | - pip install -r examples/pytorch/_tests_requirements.txt - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt - - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: ${{ matrix.machine_type }}_run_examples_gpu - path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu - run_pipelines_torch_gpu: + if: ${{ inputs.job == 'run_pipelines_torch_gpu' }} name: PyTorch pipelines strategy: fail-fast: false @@ -146,7 +100,6 @@ jobs: container: image: huggingface/transformers-pytorch-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup steps: - name: Update clone working-directory: /transformers @@ -187,6 +140,7 @@ jobs: path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu run_pipelines_tf_gpu: + if: ${{ inputs.job == 'run_pipelines_tf_gpu' }} name: TensorFlow pipelines strategy: fail-fast: false @@ -196,7 +150,6 @@ jobs: container: image: huggingface/transformers-tensorflow-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup steps: - name: Update clone working-directory: /transformers @@ -237,14 +190,65 @@ jobs: name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu + run_examples_gpu: + if: ${{ inputs.job == 'run_examples_gpu' }} + name: Examples directory + strategy: + fail-fast: false + matrix: + machine_type: [single-gpu] + runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . 
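+        # (the editable install makes the freshly checked-out `${{ github.sha }}` revision take
+        # precedence over the copy of `transformers` baked into the Docker image)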
+ + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run examples tests on GPU + working-directory: /transformers + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_examples_gpu examples/pytorch + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_examples_gpu/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.machine_type }}_run_examples_gpu + path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu + run_all_tests_torch_cuda_extensions_gpu: + if: ${{ inputs.job == 'run_all_tests_torch_cuda_extensions_gpu' }} name: Torch CUDA extension tests strategy: fail-fast: false matrix: machine_type: [single-gpu, multi-gpu] runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] - needs: setup container: image: huggingface/transformers-pytorch-deepspeed-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ @@ -298,6 +302,7 @@ jobs: path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu run_tests_quantization_torch_gpu: + if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} name: Quantization tests strategy: fail-fast: false @@ -307,7 +312,6 @@ jobs: container: image: huggingface/transformers-quantization-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ - needs: setup steps: - name: Update clone working-directory: /transformers @@ -348,18 +352,11 @@ jobs: path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu run_extract_warnings: + # Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic. 
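+    # (the other jobs produce no warnings artifact, and `utils/notification_service.py` only looks
+    # for the `warnings_in_ci` artifact when `CI_TEST_JOB` is `run_tests_gpu`)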
+ if: ${{ always() && inputs.job == 'run_tests_gpu' }} name: Extract warnings in CI artifacts runs-on: ubuntu-22.04 - if: always() - needs: [ - setup, - run_tests_gpu, - run_examples_gpu, - run_pipelines_tf_gpu, - run_pipelines_torch_gpu, - run_all_tests_torch_cuda_extensions_gpu, - run_tests_quantization_torch_gpu, - ] + needs: [setup, run_tests_gpu] steps: - name: Checkout transformers uses: actions/checkout@v3 @@ -396,52 +393,24 @@ jobs: path: warnings_in_ci/selected_warnings.json send_results: - name: Send results to webhook - runs-on: ubuntu-22.04 - if: always() + name: Slack Report needs: [ setup, run_tests_gpu, - run_examples_gpu, - run_pipelines_tf_gpu, run_pipelines_torch_gpu, + run_pipelines_tf_gpu, + run_examples_gpu, run_all_tests_torch_cuda_extensions_gpu, run_tests_quantization_torch_gpu, run_extract_warnings ] - steps: - - name: Preliminary job status - shell: bash - # For the meaning of these environment variables, see the job `Setup` - run: | - echo "Setup status: ${{ needs.setup.result }}" - - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 - - name: Send message to Slack - env: - CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} - CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} - CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} - CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} - CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} - ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} - CI_EVENT: scheduled - CI_SHA: ${{ github.sha }} - CI_WORKFLOW_REF: ${{ github.workflow_ref }} - SETUP_STATUS: ${{ needs.setup.result }} - # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change - # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. - run: | - sudo apt-get install -y curl - pip install slack_sdk - pip show slack_sdk - python utils/notification_service.py "${{ needs.setup.outputs.folder_slices }}" - - # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - - name: Failure table artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: prev_ci_results - path: prev_ci_results + if: ${{ always() }} + uses: ./.github/workflows/slack-report.yml + with: + job: ${{ inputs.job }} + # This would be `skipped` if `setup` is skipped. + setup_status: ${{ needs.setup.result }} + slack_report_channel: ${{ inputs.slack_report_channel }} + # This would be an empty string if `setup` is skipped. 
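+      # (an empty `folder_slices` makes `utils/notification_service.py` fall back to an empty model list;
+      # see its change below)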
+      folder_slices: ${{ needs.setup.outputs.folder_slices }}
+    secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml
new file mode 100644
index 0000000000..0e964e8596
--- /dev/null
+++ b/.github/workflows/slack-report.yml
@@ -0,0 +1,64 @@
+name: CI slack report
+
+on:
+  workflow_call:
+    inputs:
+      job:
+        required: true
+        type: string
+      slack_report_channel:
+        required: true
+        type: string
+      setup_status:
+        required: true
+        type: string
+      folder_slices:
+        required: true
+        type: string
+
+
+jobs:
+  send_results:
+    name: Send results to webhook
+    runs-on: ubuntu-22.04
+    if: always()
+    steps:
+      - name: Preliminary job status
+        shell: bash
+        # For the meaning of these environment variables, see the job `Setup`
+        run: |
+          echo "Setup status: ${{ inputs.setup_status }}"
+
+      - uses: actions/checkout@v3
+      - uses: actions/download-artifact@v3
+      - name: Send message to Slack
+        env:
+          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+          CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
+          SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }}
+          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
+          CI_EVENT: scheduled
+          CI_SHA: ${{ github.sha }}
+          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
+          CI_TEST_JOB: ${{ inputs.job }}
+          SETUP_STATUS: ${{ inputs.setup_status }}
+        # We pass `needs.setup.outputs.matrix` as the argument. Some processing in `notification_service.py` is
+        # required to change `models/bert` to `models_bert`, as the artifact names use `_` instead of `/`.
+        # For a job that doesn't depend on (i.e. `needs`) `setup`, the value for `inputs.folder_slices` would be an
+        # empty string, and the called script still gets one argument (which is the empty string).
+        run: |
+          sudo apt-get install -y curl
+          pip install slack_sdk
+          pip show slack_sdk
+          python utils/notification_service.py "${{ inputs.folder_slices }}"
+
+      # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack.
+      - name: Failure table artifacts
+        # Only the model testing job is concerned for this step
+        if: ${{ inputs.job == 'run_tests_gpu' }}
+        uses: actions/upload-artifact@v3
+        with:
+          name: prev_ci_results
+          path: prev_ci_results
diff --git a/utils/notification_service.py b/utils/notification_service.py
index d29e6994a2..5378348ee9 100644
--- a/utils/notification_service.py
+++ b/utils/notification_service.py
@@ -227,10 +227,13 @@ class Message:
             button_text = "Check warnings (Link not found)"
             # Use the workflow run link
             job_link = f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}"
-        if "Extract warnings in CI artifacts" in github_actions_job_links:
-            button_text = "Check warnings"
-            # Use the actual job link
-            job_link = f"{github_actions_job_links['Extract warnings in CI artifacts']}"
+
+        for job in github_actions_jobs:
+            if "Extract warnings in CI artifacts" in job["name"] and job["conclusion"] == "success":
+                button_text = "Check warnings"
+                # Use the actual job link
+                job_link = job["html_url"]
+                break

         huggingface_hub_warnings = [x for x in self.selected_warnings if "huggingface_hub" in x]
         text = f"There are {len(self.selected_warnings)} warnings being selected."
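# A minimal sketch (not part of the patch) of the channel-routing pattern above: each calling workflow
# exports SLACK_REPORT_CHANNEL, and the report script posts to that channel instead of reading a
# hard-coded channel secret. Assumes `slack_sdk` is installed and both environment variables below
# are set by the workflow; the message text is a placeholder.
import os

from slack_sdk import WebClient

client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"])
client.chat_postMessage(
    channel=os.environ["SLACK_REPORT_CHANNEL"],  # e.g. "#transformers-ci-daily-models"
    text="CI report",  # placeholder message body
)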
@@ -573,7 +576,7 @@ class Message: print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=SLACK_REPORT_CHANNEL_ID, text=text, blocks=payload, ) @@ -586,7 +589,7 @@ class Message: text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed." self.thread_ts = client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=SLACK_REPORT_CHANNEL_ID, blocks=payload, text=text, ) @@ -712,7 +715,7 @@ class Message: print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=SLACK_REPORT_CHANNEL_ID, text=f"Results for {job}", blocks=blocks, thread_ts=self.thread_ts["ts"], @@ -735,7 +738,7 @@ class Message: print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=SLACK_REPORT_CHANNEL_ID, text=f"Results for {job}", blocks=blocks, thread_ts=self.thread_ts["ts"], @@ -749,7 +752,7 @@ class Message: print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_REPORT_CHANNEL_ID"], + channel=SLACK_REPORT_CHANNEL_ID, text="Results for new failures", blocks=blocks, thread_ts=self.thread_ts["ts"], @@ -852,6 +855,8 @@ def prepare_reports(title, header, reports, to_truncate=True): if __name__ == "__main__": + SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"] + # runner_status = os.environ.get("RUNNER_STATUS") # runner_env_status = os.environ.get("RUNNER_ENV_STATUS") setup_status = os.environ.get("SETUP_STATUS") @@ -861,7 +866,8 @@ if __name__ == "__main__": # Let's keep the lines regardig runners' status (we might be able to use them again in the future) runner_not_available = False runner_failed = False - setup_failed = True if setup_status is not None and setup_status != "success" else False + # Some jobs don't depend (`needs`) on the job `setup`: in this case, the status of the job `setup` is `skipped`. + setup_failed = False if setup_status in ["skipped", "success"] else True org = "huggingface" repo = "transformers" @@ -929,14 +935,21 @@ if __name__ == "__main__": Message.error_out(title, ci_title, runner_not_available, runner_failed, setup_failed) exit(0) - arguments = sys.argv[1:][0] - try: - folder_slices = ast.literal_eval(arguments) - # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names). - models = [x.replace("models/", "models_") for folders in folder_slices for x in folders] - except SyntaxError: - Message.error_out(title, ci_title) - raise ValueError("Errored out.") + # sys.argv[0] is always `utils/notification_service.py`. + arguments = sys.argv[1:] + # In our usage in `.github/workflows/slack-report.yml`, we always pass an argument when calling this script. + # The argument could be an empty string `""` if a job doesn't depend on the job `setup`. + if arguments[0] == "": + models = [] + else: + model_list_as_str = arguments[0] + try: + folder_slices = ast.literal_eval(model_list_as_str) + # Need to change from elements like `models/bert` to `models_bert` (the ones used as artifact names). 
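+            # e.g. folder_slices = [["models/bert", "models/gpt2"], ["models/t5"]] gives
+            # ["models_bert", "models_gpt2", "models_t5"]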
+            models = [x.replace("models/", "models_") for folders in folder_slices for x in folders]
+        except Exception:
+            Message.error_out(title, ci_title)
+            raise ValueError("Errored out.")

     github_actions_jobs = get_jobs(
         workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"]
@@ -1039,9 +1052,9 @@ if __name__ == "__main__":

     # Additional runs
     additional_files = {
-        "Examples directory": "run_examples_gpu",
         "PyTorch pipelines": "run_tests_torch_pipeline_gpu",
         "TensorFlow pipelines": "run_tests_tf_pipeline_gpu",
+        "Examples directory": "run_examples_gpu",
         "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports",
         "Quantization tests": "run_tests_quantization_torch_gpu",
     }
@@ -1056,6 +1069,24 @@ if __name__ == "__main__":
     elif ci_event.startswith("Push CI (AMD)"):
         additional_files = {}

+    # A map associating the job names (specified by `inputs.job` in a workflow file) with the keys of
+    # `additional_files`. This is used to remove some entries in `additional_files` that are not concerned by a
+    # specific job. See below.
+    job_to_test_map = {
+        "run_pipelines_torch_gpu": "PyTorch pipelines",
+        "run_pipelines_tf_gpu": "TensorFlow pipelines",
+        "run_examples_gpu": "Examples directory",
+        "run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests",
+        "run_tests_quantization_torch_gpu": "Quantization tests",
+    }
+
+    # Remove some entries in `additional_files` if they are not concerned.
+    test_name = None
+    job_name = os.getenv("CI_TEST_JOB")
+    if job_name in job_to_test_map:
+        test_name = job_to_test_map[job_name]
+        additional_files = {k: v for k, v in additional_files.items() if k == test_name}
+
     additional_results = {
         key: {
             "failed": {"unclassified": 0, "single": 0, "multi": 0},
@@ -1103,17 +1134,24 @@ if __name__ == "__main__":
                     {"line": line, "trace": stacktraces.pop(0)}
                 )

+    # Let's only check the warning for the model testing job. Currently, the job `run_extract_warnings` is only run
+    # when `inputs.job` (in the workflow file) is `run_tests_gpu`. The reason is: otherwise we need to save several
+    # artifacts with different names which complicates the logic for an insignificant part of the CI workflow reporting.
     selected_warnings = []
-    if "warnings_in_ci" in available_artifacts:
-        directory = available_artifacts["warnings_in_ci"].paths[0]["path"]
-        with open(os.path.join(directory, "selected_warnings.json")) as fp:
-            selected_warnings = json.load(fp)
+    if job_name == "run_tests_gpu":
+        if "warnings_in_ci" in available_artifacts:
+            directory = available_artifacts["warnings_in_ci"].paths[0]["path"]
+            with open(os.path.join(directory, "selected_warnings.json")) as fp:
+                selected_warnings = json.load(fp)

     if not os.path.isdir(os.path.join(os.getcwd(), "prev_ci_results")):
         os.makedirs(os.path.join(os.getcwd(), "prev_ci_results"))

-    with open("prev_ci_results/model_results.json", "w", encoding="UTF-8") as fp:
-        json.dump(model_results, fp, indent=4, ensure_ascii=False)
+    # Only the model testing job is concerned: this condition prevents other jobs from uploading the empty list
+    # as results.
+ if job_name == "run_tests_gpu": + with open("prev_ci_results/model_results.json", "w", encoding="UTF-8") as fp: + json.dump(model_results, fp, indent=4, ensure_ascii=False) prev_ci_artifacts = None target_workflow = "huggingface/transformers/.github/workflows/self-scheduled.yml@refs/heads/main" From d9fa13ce62711c5c4840d1faf0473bfb9237a508 Mon Sep 17 00:00:00 2001 From: Kola Date: Fri, 5 Apr 2024 15:19:54 +0100 Subject: [PATCH 034/110] Add docstrings and types for MambaCache (#30023) * Add docstrings and types for MambaCache * Update src/transformers/models/mamba/modeling_mamba.py * Update src/transformers/models/mamba/modeling_mamba.py * Update src/transformers/models/mamba/modeling_mamba.py * make fixup * import copy in generation_whisper * ruff * Revert "make fixup" This reverts commit c4fedd6f60e3b0f11974a11433bc130478829a5c. --- .../models/mamba/modeling_mamba.py | 22 ++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/mamba/modeling_mamba.py b/src/transformers/models/mamba/modeling_mamba.py index 4c3cfaa48d..8f19c36126 100644 --- a/src/transformers/models/mamba/modeling_mamba.py +++ b/src/transformers/models/mamba/modeling_mamba.py @@ -61,7 +61,23 @@ from ..deprecated._archive_maps import MAMBA_PRETRAINED_MODEL_ARCHIVE_LIST # no class MambaCache: - def __init__(self, config, batch_size, dtype=torch.float16, device=None): + """ + Arguments: + config: MambaConfig + batch_size: int + dtype: torch.dtype + device: torch.device + + Attributes: + seqlen_offset: int + dtype: torch.dtype + conv_states: Dict[int, torch.Tensor] # layer_idx -> [batch_size, intermediate_size, conv_kernel_size] + ssm_states: Dict[int, torch.Tensor] # layer_idx -> [batch_size, intermediate_size, ssm_state_size] + """ + + def __init__( + self, config: MambaConfig, batch_size: int, dtype: torch.dtype = torch.float16, device: Optional[str] = None + ): self.seqlen_offset = 0 self.dtype = dtype intermediate_size = config.intermediate_size @@ -86,13 +102,13 @@ class MambaMixer(nn.Module): and is why Mamba is called **selective** state spaces) """ - def __init__(self, config, layer_idx): + def __init__(self, config: MambaConfig, layer_idx: int): super().__init__() self.hidden_size = config.hidden_size self.ssm_state_size = config.state_size self.conv_kernel_size = config.conv_kernel self.intermediate_size = config.intermediate_size - self.time_step_rank = config.time_step_rank + self.time_step_rank = int(config.time_step_rank) self.layer_idx = layer_idx self.use_conv_bias = config.use_conv_bias self.conv1d = nn.Conv1d( From 9b5a6450d481b0f02834684ffd8b3ba4cbbd6fe0 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 5 Apr 2024 17:49:46 +0200 Subject: [PATCH 035/110] Fix auto tests (#30067) * fix * fix --------- Co-authored-by: ydshieh --- tests/models/auto/test_modeling_auto.py | 4 ++-- tests/models/auto/test_modeling_tf_auto.py | 8 ++++---- tests/models/auto/test_modeling_tf_pytorch.py | 4 ++-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/models/auto/test_modeling_auto.py b/tests/models/auto/test_modeling_auto.py index a8e42d77f9..363028c7f2 100644 --- a/tests/models/auto/test_modeling_auto.py +++ b/tests/models/auto/test_modeling_auto.py @@ -141,7 +141,7 @@ class AutoModelTest(unittest.TestCase): @slow def test_model_for_causal_lm(self): - model_name = "google-bert/bert-base-uncased" + model_name = "openai-community/gpt2" config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) 
self.assertIsInstance(config, GPT2Config) @@ -165,7 +165,7 @@ class AutoModelTest(unittest.TestCase): @slow def test_model_for_encoder_decoder_lm(self): - model_name = "google-bert/bert-base-uncased" + model_name = "google-t5/t5-base" config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, T5Config) diff --git a/tests/models/auto/test_modeling_tf_auto.py b/tests/models/auto/test_modeling_tf_auto.py index 53a07b1970..a63d9fbe4c 100644 --- a/tests/models/auto/test_modeling_tf_auto.py +++ b/tests/models/auto/test_modeling_tf_auto.py @@ -118,15 +118,15 @@ class TFAutoModelTest(unittest.TestCase): model_name = "openai-community/gpt2" config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) - self.assertIsInstance(config, BertConfig) + self.assertIsInstance(config, GPT2Config) model = TFAutoModelWithLMHead.from_pretrained(model_name) self.assertIsNotNone(model) - self.assertIsInstance(model, TFBertForMaskedLM) + self.assertIsInstance(model, TFGPT2LMHeadModel) @slow def test_model_for_masked_lm(self): - model_name = "openai-community/gpt2" + model_name = "google-bert/bert-base-uncased" config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, BertConfig) @@ -138,7 +138,7 @@ class TFAutoModelTest(unittest.TestCase): @slow def test_model_for_encoder_decoder_lm(self): - model_name = "openai-community/gpt2" + model_name = "google-t5/t5-base" config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, T5Config) diff --git a/tests/models/auto/test_modeling_tf_pytorch.py b/tests/models/auto/test_modeling_tf_pytorch.py index 5b9036cbf1..2c59c906db 100644 --- a/tests/models/auto/test_modeling_tf_pytorch.py +++ b/tests/models/auto/test_modeling_tf_pytorch.py @@ -103,7 +103,7 @@ class TFPTAutoModelTest(unittest.TestCase): @slow def test_model_for_causal_lm(self): - model_name = "google-bert/bert-base-uncased" + model_name = "openai-community/gpt2" config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, GPT2Config) @@ -156,7 +156,7 @@ class TFPTAutoModelTest(unittest.TestCase): @slow def test_model_for_encoder_decoder_lm(self): - model_name = "google-bert/bert-base-uncased" + model_name = "google-t5/t5-base" config = AutoConfig.from_pretrained(model_name) self.assertIsNotNone(config) self.assertIsInstance(config, T5Config) From 76fa17c1663a0efeca7208c20579833365584889 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Fri, 5 Apr 2024 21:28:58 +0500 Subject: [PATCH 036/110] Fix whisper kwargs and generation config (#30018) * clean-up whisper kwargs * failing test --- .../models/whisper/generation_whisper.py | 79 ++++--------------- 1 file changed, 15 insertions(+), 64 deletions(-) diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 4d30a22c76..bd88b67bc6 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -511,7 +511,6 @@ class WhisperGenerationMixin: self._set_language_and_task( language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config ) - self._set_token_ids(generation_config=generation_config, config=self.config, kwargs=kwargs) self._set_num_frames( return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs ) @@ -546,13 +545,13 @@ class 
WhisperGenerationMixin: logits_processor=logits_processor, begin_index=begin_index, # begin index is index of first generated decoder token is_shortform=is_shortform, - num_beams=kwargs.get("num_beams", 1), + num_beams=generation_config.num_beams, ) # 5. If we're in shortform mode, simple generate the whole input at once and return the output if is_shortform: if temperature is not None: - kwargs["temperature"] = temperature + generation_config.temperature = temperature decoder_input_ids = kwargs.pop("decoder_input_ids", None) if decoder_input_ids is None: @@ -564,8 +563,8 @@ class WhisperGenerationMixin: [prompt_ids[None].repeat(decoder_input_ids.shape[0], 1), decoder_input_ids], dim=-1 ) - if kwargs.get("max_new_tokens", 0) + decoder_input_ids.shape[-1] > self.config.max_target_positions: - max_new_tokens = kwargs.get("max_new_tokens", 0) + max_new_tokens = generation_config.max_new_tokens if generation_config.max_new_tokens is not None else 0 + if max_new_tokens + decoder_input_ids.shape[-1] > self.config.max_target_positions: raise ValueError( f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` " f"is {max_new_tokens}. Thus, the combined length of " @@ -666,11 +665,10 @@ class WhisperGenerationMixin: ) # 6.6 set max new tokens or max length - kwargs = self._set_max_new_tokens_and_length( + self._set_max_new_tokens_and_length( config=self.config, decoder_input_ids=decoder_input_ids, generation_config=generation_config, - kwargs=kwargs, ) # 6.7 Set current `begin_index` for all logit processors @@ -770,9 +768,9 @@ class WhisperGenerationMixin: for fallback_idx, temperature in enumerate(temperatures): generation_config.do_sample = temperature is not None and temperature > 0.0 - generation_config.temperature = temperature if generation_config.do_sample else 1.0 - generation_config.num_beams = kwargs.get("num_beams", 1) if not generation_config.do_sample else 1 + if generation_config.do_sample: + generation_config.num_beams = 1 generate_kwargs = copy.copy(kwargs) for key in ["do_sample", "temperature", "num_beams"]: @@ -1095,11 +1093,8 @@ class WhisperGenerationMixin: task = getattr(generation_config, "task", None) language = getattr(generation_config, "language", None) - if kwargs.get("forced_decoder_ids", None) is not None: - forced_decoder_ids = kwargs["forced_decoder_ids"] - elif hasattr(generation_config, "forced_decoder_ids") and generation_config.forced_decoder_ids is not None: - forced_decoder_ids = generation_config.forced_decoder_ids - + forced_decoder_ids = generation_config.forced_decoder_ids + if forced_decoder_ids is not None: if language is None and task is None and forced_decoder_ids[0][1] is None: logger.warning_once( "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English." @@ -1107,8 +1102,6 @@ class WhisperGenerationMixin: ) elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None: forced_decoder_ids = config.forced_decoder_ids - else: - forced_decoder_ids = None if forced_decoder_ids is not None and task is not None: logger.info( @@ -1288,21 +1281,6 @@ class WhisperGenerationMixin: "Passing `decoder_input_ids` is deprecated. 
Consider passing `prompt_ids` instead.", ) - @staticmethod - def _set_token_ids(generation_config, config, kwargs): - eos_token_id = kwargs.pop("eos_token_id", None) - decoder_start_token_id = kwargs.pop("decoder_start_token_id", None) - - eos_token_id = eos_token_id if eos_token_id is not None else generation_config.eos_token_id - decoder_start_token_id = ( - decoder_start_token_id if decoder_start_token_id is not None else generation_config.decoder_start_token_id - ) - - generation_config.eos_token_id = eos_token_id if eos_token_id is not None else config.eos_token_id - generation_config.decoder_start_token_id = ( - decoder_start_token_id if decoder_start_token_id is not None else config.decoder_start_token_id - ) - @staticmethod def _set_num_frames(return_token_timestamps, generation_config, kwargs): if return_token_timestamps: @@ -1313,7 +1291,6 @@ class WhisperGenerationMixin: "Model generation config has no `alignment_heads`, token-level timestamps not available. " "See https://gist.github.com/hollance/42e32852f24243b748ae6bc1f985b13a on how to add this property to the generation config." ) - generation_config.num_frames = kwargs.pop("num_frames", None) @staticmethod @@ -1517,47 +1494,21 @@ class WhisperGenerationMixin: return decoder_input_ids, kwargs @staticmethod - def _set_max_new_tokens_and_length(config, decoder_input_ids, generation_config, kwargs): + def _set_max_new_tokens_and_length(config, decoder_input_ids, generation_config): num_initial_tokens = min(config.max_target_positions // 2 - 1, decoder_input_ids.shape[-1] - 1) - passed_max_length = kwargs.pop("max_length", None) - passed_max_new_tokens = kwargs.pop("max_new_tokens", None) - max_length_config = getattr(generation_config, "max_length", None) - max_new_tokens_config = getattr(generation_config, "max_new_tokens", None) - - max_new_tokens = None - max_length = None - # Make sure we don't get larger than `max_length` - if passed_max_length is not None and passed_max_new_tokens is None: - max_length = min(passed_max_length + num_initial_tokens, config.max_target_positions) - logger.info( - f"Increase max_length from {passed_max_length} to {max_length} since input is conditioned on previous segment." - ) - elif max_length_config is not None and passed_max_new_tokens is None and max_new_tokens_config is None: + if generation_config.max_length is not None and generation_config.max_new_tokens is None: max_length = min(generation_config.max_length + num_initial_tokens, config.max_target_positions) logger.info( - f"Increase max_length from {max_length_config} to {max_length} since input is conditioned on previous segment." + f"Increase max_length from {generation_config.max_length} to {max_length} since input is conditioned on previous segment." 
) elif ( - passed_max_new_tokens is not None - and passed_max_new_tokens + decoder_input_ids.shape[-1] > config.max_target_positions + generation_config.max_new_tokens is not None + and generation_config.max_new_tokens + decoder_input_ids.shape[-1] > config.max_target_positions ): max_new_tokens = config.max_target_positions - decoder_input_ids.shape[-1] - elif ( - passed_max_new_tokens is None - and max_new_tokens_config is not None - and max_new_tokens_config + decoder_input_ids.shape[-1] > config.max_target_positions - ): - max_new_tokens = config.max_target_positions - decoder_input_ids.shape[-1] - - if max_new_tokens is not None: - kwargs["max_new_tokens"] = max_new_tokens - - if max_length is not None: - kwargs["max_length"] = max_length - - return kwargs + generation_config.max_new_tokens = max_new_tokens @staticmethod def _retrieve_compression_ratio(tokens, vocab_size): From 1fc34aa6663744d1ef53674b560b0f2ca061e1db Mon Sep 17 00:00:00 2001 From: Cylis Date: Mon, 8 Apr 2024 15:44:05 +0800 Subject: [PATCH 037/110] doc: Correct spelling mistake (#30107) --- docs/source/zh/autoclass_tutorial.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/zh/autoclass_tutorial.md b/docs/source/zh/autoclass_tutorial.md index 7205aa0872..f056f12d78 100644 --- a/docs/source/zh/autoclass_tutorial.md +++ b/docs/source/zh/autoclass_tutorial.md @@ -83,7 +83,7 @@ rendered properly in your Markdown viewer. ## AutoProcessor -多模态任务需要一种`processor`,将两种类型的预处理工具结合起来。例如,[LayoutLMV2](model_doc/layoutlmv2)模型需要一个`image processo`来处理图像和一个`tokenizer`来处理文本;`processor`将两者结合起来。 +多模态任务需要一种`processor`,将两种类型的预处理工具结合起来。例如,[LayoutLMV2](model_doc/layoutlmv2)模型需要一个`image processor`来处理图像和一个`tokenizer`来处理文本;`processor`将两者结合起来。 使用[`AutoProcessor.from_pretrained`]加载`processor`: From 1ed93be48a8f0fc77fce4dacced1976fa5d55713 Mon Sep 17 00:00:00 2001 From: vaibhavagg303 <89418214+vaibhavagg303@users.noreply.github.com> Date: Mon, 8 Apr 2024 14:06:25 +0530 Subject: [PATCH 038/110] [Whisper] Computing features on GPU in batch mode for whisper feature extractor. (#29900) * add _torch_extract_fbank_features_batch function in feature_extractor_whisper * reformat feature_extraction_whisper.py file * handle batching in single function * add gpu test & doc * add batch test & device in each __call__ * add device arg in doc string --------- Co-authored-by: vaibhav.aggarwal --- .../whisper/feature_extraction_whisper.py | 62 +++++++++++++------ .../test_feature_extraction_whisper.py | 38 +++++++++++- 2 files changed, 81 insertions(+), 19 deletions(-) diff --git a/src/transformers/models/whisper/feature_extraction_whisper.py b/src/transformers/models/whisper/feature_extraction_whisper.py index 42104c3293..508e85b91f 100644 --- a/src/transformers/models/whisper/feature_extraction_whisper.py +++ b/src/transformers/models/whisper/feature_extraction_whisper.py @@ -94,41 +94,63 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): mel_scale="slaney", ) - def _np_extract_fbank_features(self, waveform: np.array) -> np.ndarray: + def _np_extract_fbank_features(self, waveform_batch: np.array, device: str) -> np.ndarray: """ Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch implementation with 1e-5 tolerance. 
""" - log_spec = spectrogram( - waveform, - window_function(self.n_fft, "hann"), - frame_length=self.n_fft, - hop_length=self.hop_length, - power=2.0, - mel_filters=self.mel_filters, - log_mel="log10", - ) - log_spec = log_spec[:, :-1] - log_spec = np.maximum(log_spec, log_spec.max() - 8.0) - log_spec = (log_spec + 4.0) / 4.0 - return log_spec + if device != "cpu": + raise ValueError( + f"Got device `{device}` for feature extraction, but feature extraction on CUDA accelerator " + "devices requires torch, which is not installed. Either set `device='cpu'`, or " + "install torch according to the official instructions: https://pytorch.org/get-started/locally/" + ) + log_spec_batch = [] + for waveform in waveform_batch: + log_spec = spectrogram( + waveform, + window_function(self.n_fft, "hann"), + frame_length=self.n_fft, + hop_length=self.hop_length, + power=2.0, + mel_filters=self.mel_filters, + log_mel="log10", + ) + log_spec = log_spec[:, :-1] + log_spec = np.maximum(log_spec, log_spec.max() - 8.0) + log_spec = (log_spec + 4.0) / 4.0 + log_spec_batch.append(log_spec) + log_spec_batch = np.array(log_spec_batch) + return log_spec_batch - def _torch_extract_fbank_features(self, waveform: np.array) -> np.ndarray: + def _torch_extract_fbank_features(self, waveform: np.array, device: str = "cpu") -> np.ndarray: """ - Compute the log-mel spectrogram of the provided audio using the PyTorch STFT implementation. + Compute the log-mel spectrogram of the audio using PyTorch's GPU-accelerated STFT implementation with batching, + yielding results similar to cpu computing with 1e-5 tolerance. """ waveform = torch.from_numpy(waveform).type(torch.float32) window = torch.hann_window(self.n_fft) + if device != "cpu": + waveform = waveform.to(device) + window = window.to(device) stft = torch.stft(waveform, self.n_fft, self.hop_length, window=window, return_complex=True) magnitudes = stft[..., :-1].abs() ** 2 mel_filters = torch.from_numpy(self.mel_filters).type(torch.float32) + if device != "cpu": + mel_filters = mel_filters.to(device) mel_spec = mel_filters.T @ magnitudes log_spec = torch.clamp(mel_spec, min=1e-10).log10() - log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) + if waveform.dim() == 2: + max_val = log_spec.max(dim=2, keepdim=True)[0].max(dim=1, keepdim=True)[0] + log_spec = torch.maximum(log_spec, max_val - 8.0) + else: + log_spec = torch.maximum(log_spec, log_spec.max() - 8.0) log_spec = (log_spec + 4.0) / 4.0 + if device != "cpu": + log_spec = log_spec.detach().cpu() return log_spec.numpy() @staticmethod @@ -165,6 +187,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): max_length: Optional[int] = None, sampling_rate: Optional[int] = None, do_normalize: Optional[bool] = None, + device: Optional[str] = "cpu", **kwargs, ) -> BatchFeature: """ @@ -211,6 +234,9 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): do_normalize (`bool`, *optional*, defaults to `False`): Whether or not to zero-mean unit-variance normalize the input. Normalizing can help to significantly improve the performance of the model. + device (`str`, *optional*, defaults to `'cpu'`): + Specifies the device for computation of the log-mel spectrogram of audio signals in the + `_torch_extract_fbank_features` method. 
(e.g., "cpu", "cuda") """ if sampling_rate is not None: @@ -272,7 +298,7 @@ class WhisperFeatureExtractor(SequenceFeatureExtractor): extract_fbank_features = ( self._torch_extract_fbank_features if is_torch_available() else self._np_extract_fbank_features ) - input_features = [extract_fbank_features(waveform) for waveform in input_features[0]] + input_features = extract_fbank_features(input_features[0], device) if isinstance(input_features[0], List): padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features] diff --git a/tests/models/whisper/test_feature_extraction_whisper.py b/tests/models/whisper/test_feature_extraction_whisper.py index 77c7a9be3d..8b1e25927e 100644 --- a/tests/models/whisper/test_feature_extraction_whisper.py +++ b/tests/models/whisper/test_feature_extraction_whisper.py @@ -24,7 +24,7 @@ import numpy as np from datasets import load_dataset from transformers import WhisperFeatureExtractor -from transformers.testing_utils import check_json_file_has_correct_format, require_torch +from transformers.testing_utils import check_json_file_has_correct_format, require_torch, require_torch_gpu from transformers.utils.import_utils import is_torch_available from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin @@ -207,6 +207,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. return [x["array"] for x in speech_samples] + @require_torch_gpu @require_torch def test_torch_integration(self): # fmt: off @@ -223,6 +224,7 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. input_speech = self._load_datasamples(1) feature_extractor = WhisperFeatureExtractor() input_features = feature_extractor(input_speech, return_tensors="pt").input_features + self.assertEqual(input_features.shape, (1, 80, 3000)) self.assertTrue(torch.allclose(input_features[0, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) @@ -253,3 +255,37 @@ class WhisperFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. 
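         # together with the variance check on the next line, this asserts that the normalized audio
         # is (approximately) zero-mean with unit variance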
self.assertTrue(np.all(np.mean(audio) < 1e-3)) self.assertTrue(np.all(np.abs(np.var(audio) - 1) < 1e-3)) + + @require_torch_gpu + @require_torch + def test_torch_integration_batch(self): + # fmt: off + EXPECTED_INPUT_FEATURES = torch.tensor( + [ + [ + 0.1193, -0.0946, -0.1098, -0.0196, 0.0225, -0.0690, -0.1736, 0.0951, + 0.0971, -0.0817, -0.0702, 0.0162, 0.0260, 0.0017, -0.0192, -0.1678, + 0.0709, -0.1867, -0.0655, -0.0274, -0.0234, -0.1884, -0.0516, -0.0554, + -0.0274, -0.1425, -0.1423, 0.0837, 0.0377, -0.0854 + ], + [ + -0.4696, -0.0751, 0.0276, -0.0312, -0.0540, -0.0383, 0.1295, 0.0568, + -0.2071, -0.0548, 0.0389, -0.0316, -0.2346, -0.1068, -0.0322, 0.0475, + -0.1709, -0.0041, 0.0872, 0.0537, 0.0075, -0.0392, 0.0371, 0.0189, + -0.1522, -0.0270, 0.0744, 0.0738, -0.0245, -0.0667 + ], + [ + -0.2337, -0.0060, -0.0063, -0.2353, -0.0431, 0.1102, -0.1492, -0.0292, + 0.0787, -0.0608, 0.0143, 0.0582, 0.0072, 0.0101, -0.0444, -0.1701, + -0.0064, -0.0027, -0.0826, -0.0730, -0.0099, -0.0762, -0.0170, 0.0446, + -0.1153, 0.0960, -0.0361, 0.0652, 0.1207, 0.0277 + ] + ] + ) + # fmt: on + + input_speech = self._load_datasamples(3) + feature_extractor = WhisperFeatureExtractor() + input_features = feature_extractor(input_speech, return_tensors="pt").input_features + self.assertEqual(input_features.shape, (3, 80, 3000)) + self.assertTrue(torch.allclose(input_features[:, 0, :30], EXPECTED_INPUT_FEATURES, atol=1e-4)) From a907a903d644f835a5563b1133b1fc961e33c3d2 Mon Sep 17 00:00:00 2001 From: Xu Song Date: Mon, 8 Apr 2024 16:36:53 +0800 Subject: [PATCH 039/110] Change log level to warning for num_train_epochs override (#30014) --- src/transformers/trainer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 436165b0e3..99f4fa6b68 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -564,8 +564,8 @@ class Trainer: if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)): raise ValueError("The `data_collator` should be a simple callable (function, class with `__call__`).") - if args.max_steps > 0: - logger.info("max_steps is given, it will override any value given in num_train_epochs") + if args.max_steps > 0 and args.num_train_epochs > 0: + logger.warning("max_steps is given, it will override any value given in num_train_epochs") if train_dataset is not None and not has_length(train_dataset) and args.max_steps <= 0: raise ValueError( From 836e88caee95eb37a860a6c82bbd2becc6b9dc7b Mon Sep 17 00:00:00 2001 From: Howard Liberty Date: Mon, 8 Apr 2024 03:20:02 -0700 Subject: [PATCH 040/110] Make MLFlow version detection more robust and handles mlflow-skinny (#29957) * Make MLFlow version detection more robust and handles mlflow-skinny * Make function name more clear and refactor the logic * Further refactor --- src/transformers/integrations/integration_utils.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 45ef3c3c84..330fccb20d 100644 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -131,13 +131,6 @@ def is_mlflow_available(): return importlib.util.find_spec("mlflow") is not None -def get_mlflow_version(): - try: - return importlib.metadata.version("mlflow") - except importlib.metadata.PackageNotFoundError: - return 
importlib.metadata.version("mlflow-skinny") - - def is_dagshub_available(): return None not in [importlib.util.find_spec("dagshub"), importlib.util.find_spec("mlflow")] @@ -1005,12 +998,12 @@ class MLflowCallback(TrainerCallback): self._experiment_name = os.getenv("MLFLOW_EXPERIMENT_NAME", None) self._flatten_params = os.getenv("MLFLOW_FLATTEN_PARAMS", "FALSE").upper() in ENV_VARS_TRUE_VALUES self._run_id = os.getenv("MLFLOW_RUN_ID", None) - self._async_log = False + # "synchronous" flag is only available with mlflow version >= 2.8.0 # https://github.com/mlflow/mlflow/pull/9705 # https://github.com/mlflow/mlflow/releases/tag/v2.8.0 - if packaging.version.parse(get_mlflow_version()) >= packaging.version.parse("2.8.0"): - self._async_log = True + self._async_log = packaging.version.parse(self._ml_flow.__version__) >= packaging.version.parse("2.8.0") + logger.debug( f"MLflow experiment_name={self._experiment_name}, run_name={args.run_name}, nested={self._nested_run}," f" tags={self._nested_run}, tracking_uri={self._tracking_uri}" From 5e673ed2dc73249b0195ebea305dbad1e4b7cf2a Mon Sep 17 00:00:00 2001 From: Haz Sameen Shahgir <83033987+Patchwork53@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:41:28 +0600 Subject: [PATCH 041/110] updated examples/pytorch/language-modeling scripts and requirements.txt to require datasets>=2.14.0 (#30120) updated requirements.txt and require_version() calls in examples/pytorch/language-modeling to require datasets>=2.14.0 --- examples/pytorch/language-modeling/requirements.txt | 2 +- examples/pytorch/language-modeling/run_clm.py | 2 +- examples/pytorch/language-modeling/run_clm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_fim.py | 2 +- examples/pytorch/language-modeling/run_fim_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_mlm.py | 2 +- examples/pytorch/language-modeling/run_mlm_no_trainer.py | 2 +- examples/pytorch/language-modeling/run_plm.py | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/examples/pytorch/language-modeling/requirements.txt b/examples/pytorch/language-modeling/requirements.txt index 19c487fe3f..851e8de09c 100644 --- a/examples/pytorch/language-modeling/requirements.txt +++ b/examples/pytorch/language-modeling/requirements.txt @@ -1,6 +1,6 @@ accelerate >= 0.12.0 torch >= 1.3 -datasets >= 1.8.0 +datasets >= 2.14.0 sentencepiece != 0.1.92 protobuf evaluate diff --git a/examples/pytorch/language-modeling/run_clm.py b/examples/pytorch/language-modeling/run_clm.py index ce8558214e..7275e24c1b 100755 --- a/examples/pytorch/language-modeling/run_clm.py +++ b/examples/pytorch/language-modeling/run_clm.py @@ -58,7 +58,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
check_min_version("4.40.0.dev0") -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/language-modeling/run_clm_no_trainer.py b/examples/pytorch/language-modeling/run_clm_no_trainer.py index ec5ce23198..ba778d5cf8 100755 --- a/examples/pytorch/language-modeling/run_clm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py @@ -61,7 +61,7 @@ check_min_version("4.40.0.dev0") logger = get_logger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) diff --git a/examples/pytorch/language-modeling/run_fim.py b/examples/pytorch/language-modeling/run_fim.py index e9ce629f7d..201dfb3962 100644 --- a/examples/pytorch/language-modeling/run_fim.py +++ b/examples/pytorch/language-modeling/run_fim.py @@ -60,7 +60,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.40.0.dev0") -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) diff --git a/examples/pytorch/language-modeling/run_fim_no_trainer.py b/examples/pytorch/language-modeling/run_fim_no_trainer.py index 5bfc9e30b4..56b9af2461 100644 --- a/examples/pytorch/language-modeling/run_fim_no_trainer.py +++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py @@ -64,7 +64,7 @@ check_min_version("4.40.0.dev0") logger = get_logger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) diff --git a/examples/pytorch/language-modeling/run_mlm.py b/examples/pytorch/language-modeling/run_mlm.py index 474596c4f4..c831b74d64 100755 --- a/examples/pytorch/language-modeling/run_mlm.py +++ b/examples/pytorch/language-modeling/run_mlm.py @@ -57,7 +57,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
check_min_version("4.40.0.dev0") -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) diff --git a/examples/pytorch/language-modeling/run_mlm_no_trainer.py b/examples/pytorch/language-modeling/run_mlm_no_trainer.py index 5b56296bcd..81886db3c8 100755 --- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py +++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py @@ -60,7 +60,7 @@ from transformers.utils.versions import require_version check_min_version("4.40.0.dev0") logger = get_logger(__name__) -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) diff --git a/examples/pytorch/language-modeling/run_plm.py b/examples/pytorch/language-modeling/run_plm.py index 943696c975..fd8112694f 100755 --- a/examples/pytorch/language-modeling/run_plm.py +++ b/examples/pytorch/language-modeling/run_plm.py @@ -50,7 +50,7 @@ from transformers.utils.versions import require_version # Will error if the minimal version of Transformers is not installed. Remove at your own risks. check_min_version("4.40.0.dev0") -require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") +require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") logger = logging.getLogger(__name__) From d16f0abc3fed7c8c29050499e5252343d5fd6c57 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Mon, 8 Apr 2024 19:49:31 +0800 Subject: [PATCH 042/110] [tests] add `require_bitsandbytes` marker (#30116) * add bnb flag * move maker * add accelerator maker --- tests/models/persimmon/test_modeling_persimmon.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/models/persimmon/test_modeling_persimmon.py b/tests/models/persimmon/test_modeling_persimmon.py index 79cee8a648..86a69d774f 100644 --- a/tests/models/persimmon/test_modeling_persimmon.py +++ b/tests/models/persimmon/test_modeling_persimmon.py @@ -23,6 +23,7 @@ from parameterized import parameterized from transformers import PersimmonConfig, is_torch_available, set_seed from transformers.testing_utils import ( backend_empty_cache, + require_bitsandbytes, require_torch, require_torch_accelerator, require_torch_fp16, @@ -465,6 +466,8 @@ class PersimmonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester @require_torch class PersimmonIntegrationTest(unittest.TestCase): @slow + @require_torch_accelerator + @require_bitsandbytes def test_model_8b_chat_logits(self): input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] model = PersimmonForCausalLM.from_pretrained( @@ -492,6 +495,7 @@ class PersimmonIntegrationTest(unittest.TestCase): @slow @require_torch_accelerator @require_torch_fp16 + @require_bitsandbytes def test_model_8b_chat_greedy_generation(self): EXPECTED_TEXT_COMPLETION = """human: Simply put, the theory of relativity states that?\n\nadept: The theory of relativity states that the laws of physics are the same for all observers, regardless of their relative motion.""" prompt = "human: Simply 
put, the theory of relativity states that?\n\nadept:" From f5658732d5bd38657b6cf3138f69373a94866c61 Mon Sep 17 00:00:00 2001 From: JINO ROHIT Date: Mon, 8 Apr 2024 17:19:59 +0530 Subject: [PATCH 043/110] fixing issue 30034 - adding data format for run_ner.py (#30088) --- examples/pytorch/token-classification/README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/examples/pytorch/token-classification/README.md b/examples/pytorch/token-classification/README.md index 568e5242fe..b880b82030 100644 --- a/examples/pytorch/token-classification/README.md +++ b/examples/pytorch/token-classification/README.md @@ -25,6 +25,20 @@ customize it to your needs if you need extra processing on your datasets. It will either run on a datasets hosted on our [hub](https://huggingface.co/datasets) or with your own text files for training and validation, you might just need to add some tweaks in the data preprocessing. +### Using your own data + +If you use your own data, the script expects the following format of the data - + +```bash +{ + "chunk_tags": [11, 12, 12, 21, 13, 11, 11, 21, 13, 11, 12, 13, 11, 21, 22, 11, 12, 17, 11, 21, 17, 11, 12, 12, 21, 22, 22, 13, 11, 0], + "id": "0", + "ner_tags": [0, 3, 4, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], + "pos_tags": [12, 22, 22, 38, 15, 22, 28, 38, 15, 16, 21, 35, 24, 35, 37, 16, 21, 15, 24, 41, 15, 16, 21, 21, 20, 37, 40, 35, 21, 7], + "tokens": ["The", "European", "Commission", "said", "on", "Thursday", "it", "disagreed", "with", "German", "advice", "to", "consumers", "to", "shun", "British", "lamb", "until", "scientists", "determine", "whether", "mad", "cow", "disease", "can", "be", "transmitted", "to", "sheep", "."] +} +``` + The following example fine-tunes BERT on CoNLL-2003: ```bash From 7f9aff910b8fb0b1a2cec97298d1b121ef35d78a Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 8 Apr 2024 13:29:20 +0100 Subject: [PATCH 044/110] Patch fix - don't use safetensors for TF models (#30118) * Patch fix - don't use safetensors for TF models * Skip test for TF for now * Update for another test --- tests/generation/test_framework_agnostic.py | 8 ++++---- tests/generation/test_tf_utils.py | 2 +- tests/pipelines/test_pipelines_text_generation.py | 1 + 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py index f4f13dd8d5..85a58bdf28 100644 --- a/tests/generation/test_framework_agnostic.py +++ b/tests/generation/test_framework_agnostic.py @@ -111,7 +111,7 @@ class GenerationIntegrationTestsMixin: article = """Justin Timberlake.""" gpt2_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - gpt2_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") + gpt2_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) input_ids = gpt2_tokenizer(article, return_tensors=return_tensors).input_ids if is_pt: gpt2_model = gpt2_model.to(torch_device) @@ -582,7 +582,7 @@ class GenerationIntegrationTestsMixin: tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) if is_pt: model = 
model.to(torch_device) tokens = tokens.to(torch_device) @@ -611,7 +611,7 @@ class GenerationIntegrationTestsMixin: tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) if is_pt: model = model.to(torch_device) tokens = tokens.to(torch_device) @@ -638,7 +638,7 @@ class GenerationIntegrationTestsMixin: tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) if is_pt: model = model.to(torch_device) tokens = tokens.to(torch_device) diff --git a/tests/generation/test_tf_utils.py b/tests/generation/test_tf_utils.py index f40ceebef7..73eba05ece 100644 --- a/tests/generation/test_tf_utils.py +++ b/tests/generation/test_tf_utils.py @@ -194,7 +194,7 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors="tf") - model = TFAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model = TFAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=False) eos_token_id = 638 # forces the generation to happen on CPU, to avoid GPU-related quirks diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index ada04c7dbe..318526b850 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -268,6 +268,7 @@ class TextGenerationPipelineTests(unittest.TestCase): text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer) return text_generator, ["This is a test", "Another test"] + @require_torch # See https://github.com/huggingface/transformers/issues/30117 def test_stop_sequence_stopping_criteria(self): prompt = """Hello I believe in""" text_generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-gpt2") From 0201f6420b186664095844666b565dd0c5c897b3 Mon Sep 17 00:00:00 2001 From: Utkarsha Gupte <89600822+UtkarshaGupte@users.noreply.github.com> Date: Mon, 8 Apr 2024 06:21:16 -0700 Subject: [PATCH 045/110] [#29174] ImportError Fix: Trainer with PyTorch requires accelerate>=0.20.1 Fix (#29888) * ImportError: Trainer with PyTorch requires accelerate>=0.20.1 Fix Adding the evaluate and accelerate installs at the beginning of the cell to fix the issue * ImportError Fix: Trainer with PyTorch requires accelerate>=0.20.1 * Import Error Fix * Update installation.md * Update quicktour.md * rollback other lang changes * Update _config.py * updates for other languages * fixing error * Tutorial Update * Update tokenization_utils_base.py * Just use an optimizer string to pass the doctest? 
--------- Co-authored-by: Matt --- docs/source/_config.py | 2 +- docs/source/de/_config.py | 2 +- docs/source/en/_config.py | 2 +- docs/source/en/quicktour.md | 4 ++-- docs/source/es/_config.py | 2 +- docs/source/fr/_config.py | 2 +- docs/source/fr/quicktour.md | 2 +- docs/source/it/_config.py | 2 +- docs/source/ja/quicktour.md | 2 +- docs/source/ko/_config.py | 2 +- docs/source/ko/quicktour.md | 2 +- docs/source/pt/_config.py | 2 +- docs/source/te/quicktour.md | 2 +- docs/source/zh/quicktour.md | 2 +- 14 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/source/_config.py b/docs/source/_config.py index d26d908aa2..f49e4e4731 100644 --- a/docs/source/_config.py +++ b/docs/source/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Transformers installation -! pip install transformers datasets evaluate +! pip install transformers datasets evaluate accelerate # To install from source instead of the last release, comment the command above and uncomment the following one. # ! pip install git+https://github.com/huggingface/transformers.git """ diff --git a/docs/source/de/_config.py b/docs/source/de/_config.py index a6d75853f5..f49e4e4731 100644 --- a/docs/source/de/_config.py +++ b/docs/source/de/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Transformers installation -! pip install transformers datasets +! pip install transformers datasets evaluate accelerate # To install from source instead of the last release, comment the command above and uncomment the following one. # ! pip install git+https://github.com/huggingface/transformers.git """ diff --git a/docs/source/en/_config.py b/docs/source/en/_config.py index a6d75853f5..f49e4e4731 100644 --- a/docs/source/en/_config.py +++ b/docs/source/en/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Transformers installation -! pip install transformers datasets +! pip install transformers datasets evaluate accelerate # To install from source instead of the last release, comment the command above and uncomment the following one. # ! pip install git+https://github.com/huggingface/transformers.git """ diff --git a/docs/source/en/quicktour.md b/docs/source/en/quicktour.md index 904e0bbc74..9f8ae15700 100644 --- a/docs/source/en/quicktour.md +++ b/docs/source/en/quicktour.md @@ -23,7 +23,7 @@ Get up and running with 🤗 Transformers! Whether you're a developer or an ever Before you begin, make sure you have all the necessary libraries installed: ```bash -!pip install transformers datasets +!pip install transformers datasets evaluate accelerate ``` You'll also need to install your preferred machine learning framework: @@ -547,7 +547,7 @@ All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs ```py >>> from tensorflow.keras.optimizers import Adam - >>> model.compile(optimizer=Adam(3e-5)) # No loss argument! + >>> model.compile(optimizer='adam') # No loss argument! >>> model.fit(tf_dataset) # doctest: +SKIP ``` diff --git a/docs/source/es/_config.py b/docs/source/es/_config.py index a6d75853f5..f49e4e4731 100644 --- a/docs/source/es/_config.py +++ b/docs/source/es/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Transformers installation -! pip install transformers datasets +! pip install transformers datasets evaluate accelerate # To install from source instead of the last release, comment the command above and uncomment the following one. # ! 
pip install git+https://github.com/huggingface/transformers.git """ diff --git a/docs/source/fr/_config.py b/docs/source/fr/_config.py index 07f1de5f7d..f3f59bf520 100644 --- a/docs/source/fr/_config.py +++ b/docs/source/fr/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Installation de Transformers -! pip install transformers datasets +! pip install transformers datasets evaluate accelerate # Pour installer à partir du code source au lieu de la dernière version, commentez la commande ci-dessus et décommentez la suivante. # ! pip install git+https://github.com/huggingface/transformers.git """ diff --git a/docs/source/fr/quicktour.md b/docs/source/fr/quicktour.md index f76764f103..99a53afdaa 100644 --- a/docs/source/fr/quicktour.md +++ b/docs/source/fr/quicktour.md @@ -23,7 +23,7 @@ Soyez opérationnel avec 🤗 Transformers ! Que vous soyez un développeur ou u Avant de commencer, assurez-vous que vous avez installé toutes les bibliothèques nécessaires : ```bash -!pip install transformers datasets +!pip install transformers datasets evaluate accelerate ``` Vous aurez aussi besoin d'installer votre bibliothèque d'apprentissage profond favorite : diff --git a/docs/source/it/_config.py b/docs/source/it/_config.py index b05ae95c03..72b362f9a7 100644 --- a/docs/source/it/_config.py +++ b/docs/source/it/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Installazione di Transformers -! pip install transformers datasets +! pip install transformers datasets evaluate accelerate # Per installare dalla fonte invece dell'ultima versione rilasciata, commenta il comando sopra e # rimuovi la modalità commento al comando seguente. # ! pip install git+https://github.com/huggingface/transformers.git diff --git a/docs/source/ja/quicktour.md b/docs/source/ja/quicktour.md index 3bec2f827a..6e6d19dc37 100644 --- a/docs/source/ja/quicktour.md +++ b/docs/source/ja/quicktour.md @@ -26,7 +26,7 @@ specific language governing permissions and limitations under the License. 始める前に、必要なライブラリがすべてインストールされていることを確認してください: ```bash -!pip install transformers datasets +!pip install transformers datasets evaluate accelerate ``` あなたはまた、好きな機械学習フレームワークをインストールする必要があります: diff --git a/docs/source/ko/_config.py b/docs/source/ko/_config.py index 9bdfef7af9..ab61af6ef9 100644 --- a/docs/source/ko/_config.py +++ b/docs/source/ko/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Transformers 설치 방법 -! pip install transformers datasets +! pip install transformers datasets evaluate accelerate # 마지막 릴리스 대신 소스에서 설치하려면, 위 명령을 주석으로 바꾸고 아래 명령을 해제하세요. # ! pip install git+https://github.com/huggingface/transformers.git """ diff --git a/docs/source/ko/quicktour.md b/docs/source/ko/quicktour.md index c92279fa91..312ae26b58 100644 --- a/docs/source/ko/quicktour.md +++ b/docs/source/ko/quicktour.md @@ -23,7 +23,7 @@ rendered properly in your Markdown viewer. 시작하기 전에 필요한 라이브러리가 모두 설치되어 있는지 확인하세요: ```bash -!pip install transformers datasets +!pip install transformers datasets evaluate accelerate ``` 또한 선호하는 머신 러닝 프레임워크를 설치해야 합니다: diff --git a/docs/source/pt/_config.py b/docs/source/pt/_config.py index a6d75853f5..f49e4e4731 100644 --- a/docs/source/pt/_config.py +++ b/docs/source/pt/_config.py @@ -1,7 +1,7 @@ # docstyle-ignore INSTALL_CONTENT = """ # Transformers installation -! pip install transformers datasets +! pip install transformers datasets evaluate accelerate # To install from source instead of the last release, comment the command above and uncomment the following one. # ! 
pip install git+https://github.com/huggingface/transformers.git """ diff --git a/docs/source/te/quicktour.md b/docs/source/te/quicktour.md index 75efa84112..a8ce5617a1 100644 --- a/docs/source/te/quicktour.md +++ b/docs/source/te/quicktour.md @@ -23,7 +23,7 @@ rendered properly in your Markdown viewer. మీరు ప్రారంభించడానికి ముందు, మీరు అవసరమైన అన్ని లైబ్రరీలను ఇన్‌స్టాల్ చేశారని నిర్ధారించుకోండి: ```bash -!pip install transformers datasets +!pip install transformers datasets evaluate accelerate ``` మీరు మీ ప్రాధాన్య యంత్ర అభ్యాస ఫ్రేమ్‌వర్క్‌ను కూడా ఇన్‌స్టాల్ చేయాలి: diff --git a/docs/source/zh/quicktour.md b/docs/source/zh/quicktour.md index c23a38ab5f..036a27f423 100644 --- a/docs/source/zh/quicktour.md +++ b/docs/source/zh/quicktour.md @@ -23,7 +23,7 @@ rendered properly in your Markdown viewer. 在开始之前,确保你已经安装了所有必要的库: ```bash -!pip install transformers datasets +!pip install transformers datasets evaluate accelerate ``` 你还需要安装喜欢的机器学习框架: From 08c84433079439f41acce6da88c44cebcdbffbc7 Mon Sep 17 00:00:00 2001 From: Nick Doiron Date: Mon, 8 Apr 2024 09:51:11 -0400 Subject: [PATCH 046/110] Accept token in trainer.push_to_hub() (#30093) * pass token to trainer.push_to_hub * fmt * Update src/transformers/trainer.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * pass token to create_repo, update_folder --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/trainer.py | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 99f4fa6b68..16fb65e483 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3909,7 +3909,7 @@ class Trainer: else: return 0 - def init_hf_repo(self): + def init_hf_repo(self, token: Optional[str] = None): """ Initializes a git repo in `self.args.hub_model_id`. """ @@ -3922,7 +3922,8 @@ class Trainer: else: repo_name = self.args.hub_model_id - repo_url = create_repo(repo_name, token=self.args.hub_token, private=self.args.hub_private_repo, exist_ok=True) + token = token if token is not None else self.args.hub_token + repo_url = create_repo(repo_name, token=token, private=self.args.hub_private_repo, exist_ok=True) self.hub_model_id = repo_url.repo_id self.push_in_progress = None @@ -4067,7 +4068,13 @@ class Trainer: logger.info("Waiting for the current checkpoint push to be finished, this might take a couple of minutes.") self.push_in_progress.wait_until_done() - def push_to_hub(self, commit_message: Optional[str] = "End of training", blocking: bool = True, **kwargs) -> str: + def push_to_hub( + self, + commit_message: Optional[str] = "End of training", + blocking: bool = True, + token: Optional[str] = None, + **kwargs, + ) -> str: """ Upload `self.model` and `self.tokenizer` or `self.image_processor` to the 🤗 model hub on the repo `self.args.hub_model_id`. @@ -4076,6 +4083,8 @@ class Trainer: Message to commit while pushing. blocking (`bool`, *optional*, defaults to `True`): Whether the function should return only when the `git push` has finished. + token (`str`, *optional*, defaults to `None`): + Token with write permission to overwrite Trainer's original args. kwargs (`Dict[str, Any]`, *optional*): Additional keyword arguments passed along to [`~Trainer.create_model_card`]. 
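A quick usage sketch of the per-call token this patch enables; the repo id and token value below are placeholders, not part of the change:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments

# Any small checkpoint works for illustration; this tiny test model is public.
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2")
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2")

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out", hub_model_id="my-user/my-model"),  # placeholder repo id
    tokenizer=tokenizer,
)

# Previously only `args.hub_token` was consulted; with this patch the per-call
# token is forwarded to `create_repo` and `upload_folder` instead.
trainer.push_to_hub(commit_message="End of training", token="hf_xxx")  # "hf_xxx" is a placeholder
```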
@@ -4089,10 +4098,11 @@ class Trainer: model_name = Path(self.args.output_dir).name else: model_name = self.args.hub_model_id.split("/")[-1] + token = token if token is not None else self.args.hub_token # In case the user calls this method with args.push_to_hub = False if self.hub_model_id is None: - self.init_hf_repo() + self.init_hf_repo(token=token) # Needs to be executed on all processes for TPU training, but will only save on the processed determined by # self.args.should_save. @@ -4125,7 +4135,7 @@ class Trainer: repo_id=self.hub_model_id, folder_path=self.args.output_dir, commit_message=commit_message, - token=self.args.hub_token, + token=token, run_as_future=not blocking, ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"], ) From 1773afcec338c2b1a741a86b7431ad10be4518c7 Mon Sep 17 00:00:00 2001 From: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> Date: Mon, 8 Apr 2024 15:54:12 +0200 Subject: [PATCH 047/110] fix learning rate display in trainer when using galore optimizer (#30085) fix learning rate display issue in galore optimizer --- src/transformers/trainer_pt_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer_pt_utils.py b/src/transformers/trainer_pt_utils.py index 47ed90bed4..9ee670e942 100644 --- a/src/transformers/trainer_pt_utils.py +++ b/src/transformers/trainer_pt_utils.py @@ -1279,7 +1279,7 @@ class LayerWiseDummyOptimizer(torch.optim.Optimizer): def __init__(self, optimizer_dict=None, *args, **kwargs): dummy_tensor = torch.randn(1, 1) self.optimizer_dict = optimizer_dict - super().__init__([dummy_tensor], {"lr": 1e-03}) + super().__init__([dummy_tensor], {"lr": kwargs.get("lr", 1e-03)}) def zero_grad(self, set_to_none: bool = True) -> None: pass From 1897874edcba58e0e585305ca71f6ba727315219 Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Mon, 8 Apr 2024 16:25:07 +0200 Subject: [PATCH 048/110] Fix falcon with SDPA, alibi but no passed mask (#30123) * fix falcon without attention_mask & alibi * add test * Update tests/models/falcon/test_modeling_falcon.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .../models/falcon/modeling_falcon.py | 24 ++++++++----------- tests/models/falcon/test_modeling_falcon.py | 24 +++++++++++++++++++ 2 files changed, 34 insertions(+), 14 deletions(-) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index f1cff3f181..c2f3f91d5c 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -1098,27 +1098,23 @@ class FalconModel(FalconPreTrainedModel): elif head_mask is None: alibi = alibi.reshape(batch_size, -1, *alibi.shape[1:]) - attention_mask_2d = attention_mask # We don't call _prepare_4d_causal_attention_mask_for_sdpa as we need to mask alibi using the 4D attention_mask untouched. attention_mask = _prepare_4d_causal_attention_mask( attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length ) # We take care to integrate alibi bias in the attention_mask here. 
- if attention_mask_2d is None: - attention_mask = alibi / math.sqrt(self.config.hidden_size // self.num_heads) - else: - min_dtype = torch.finfo(alibi.dtype).min - attention_mask = torch.masked_fill( - alibi / math.sqrt(self.config.hidden_size // self.num_heads), - attention_mask < -1, - min_dtype, - ) + min_dtype = torch.finfo(alibi.dtype).min + attention_mask = torch.masked_fill( + alibi / math.sqrt(self.config.hidden_size // self.num_heads), + attention_mask < -1, + min_dtype, + ) - # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend - # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 - if seq_length > 1 and attention_mask.device.type == "cuda": - attention_mask = AttentionMaskConverter._unmask_unattended(attention_mask, min_dtype=min_dtype) + # From PyTorch 2.1 onwards, F.scaled_dot_product_attention with the memory-efficient attention backend + # produces nans if sequences are completely unattended in the attention mask. Details: https://github.com/pytorch/pytorch/issues/110213 + if seq_length > 1 and attention_mask.device.type == "cuda": + attention_mask = AttentionMaskConverter._unmask_unattended(attention_mask, min_dtype=min_dtype) else: # PyTorch SDPA does not support head_mask, we fall back on the eager implementation in this case. attention_mask = _prepare_4d_causal_attention_mask( diff --git a/tests/models/falcon/test_modeling_falcon.py b/tests/models/falcon/test_modeling_falcon.py index 35708d868d..c8ad2d9b32 100644 --- a/tests/models/falcon/test_modeling_falcon.py +++ b/tests/models/falcon/test_modeling_falcon.py @@ -666,3 +666,27 @@ class FalconLanguageGenerationTest(unittest.TestCase): self.assertLess(unpadded_inputs.input_ids.shape[-1], padded_inputs.input_ids.shape[-1]) # left-padding exists self.assertEqual(unpadded_gen_text[0], expected_output) self.assertEqual(padded_gen_text[0], expected_output) + + @slow + @require_torch_sdpa + def test_falcon_alibi_sdpa_matches_eager(self): + input_ids = torch.randint(0, 1000, (5, 20)) + + config = FalconConfig( + vocab_size=1000, + hidden_size=64, + num_hidden_layers=3, + num_attention_heads=4, + new_decoder_architecture=True, + alibi=True, + ) + + falcon = FalconForCausalLM(config) + falcon = falcon.eval() + + with torch.no_grad(): + # output_attentions=True dispatches to eager path + falcon_output_eager = falcon(input_ids, output_attentions=True)[0] + falcon_output_sdpa = falcon(input_ids)[0] + + self.assertTrue(torch.allclose(falcon_output_eager, falcon_output_sdpa, atol=1e-3)) From a71def025c374ca2f1f429e4407beb1742f56303 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Mon, 8 Apr 2024 16:57:38 +0200 Subject: [PATCH 049/110] Trainer / Core : Do not change init signature order (#30126) * Update trainer.py * fix copies --- src/transformers/trainer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 16fb65e483..227e92fa63 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -304,9 +304,6 @@ class Trainer: The tokenizer used to preprocess the data. If provided, will be used to automatically pad the inputs to the maximum length when batching inputs, and it will be saved along the model to make it easier to rerun an interrupted training or reuse the fine-tuned model. 
- image_processor ([`BaseImageProcessor`], *optional*): - The image processor used to preprocess the data. If provided, it will be saved along the model to make it easier - to rerun an interrupted training or reuse the fine-tuned model. model_init (`Callable[[], PreTrainedModel]`, *optional*): A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start from a new instance of the model as given by this function. @@ -331,6 +328,9 @@ by this function will be reflected in the predictions received by `compute_metrics`. Note that the labels (second parameter) will be `None` if the dataset does not have them. + image_processor ([`BaseImageProcessor`], *optional*): + The image processor used to preprocess the data. If provided, it will be saved along the model to make it easier + to rerun an interrupted training or reuse the fine-tuned model. Important attributes: @@ -361,12 +361,12 @@ class Trainer: train_dataset: Optional[Union[Dataset, IterableDataset]] = None, eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, - image_processor: Optional["BaseImageProcessor"] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + image_processor: Optional["BaseImageProcessor"] = None, ): if args is None: output_dir = "tmp_trainer" From ef38e2a7e550d062c382b6c6b8ed2e51e6ad27ae Mon Sep 17 00:00:00 2001 From: fxmarty <9808326+fxmarty@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:10:06 +0200 Subject: [PATCH 050/110] Make vitdet jit trace compliant (#30065) * remove control flows * style * rename patch_ to padded_ following review comment * style --- .../models/vitdet/modeling_vitdet.py | 35 +++++++++++-------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/src/transformers/models/vitdet/modeling_vitdet.py b/src/transformers/models/vitdet/modeling_vitdet.py index 5d12b0b585..985f00b7e5 100644 --- a/src/transformers/models/vitdet/modeling_vitdet.py +++ b/src/transformers/models/vitdet/modeling_vitdet.py @@ -94,11 +94,12 @@ class VitDetEmbeddings(nn.Module): if has_cls_token: abs_pos_embeddings = abs_pos_embeddings[:, 1:] num_position = abs_pos_embeddings.shape[1] - size = int(math.sqrt(num_position)) + size = int(math.sqrt(num_position))  # This is a constant and can be recorded as such in the ONNX export. if size * size != num_position: raise ValueError("Absolute position embeddings must be a square number.") - if size != height or size != width: + if torch.jit.is_tracing() or (size != height or size != width): + # nn.functional.interpolate is a noop in case size == height and size == width - we need to always capture this path with jit.trace. new_abs_pos_embeddings = nn.functional.interpolate( abs_pos_embeddings.reshape(1, size, size, -1).permute(0, 3, 1, 2), size=(height, width), @@ -132,6 +133,7 @@ class VitDetEmbeddings(nn.Module): return embeddings +@torch.jit.script_if_tracing  # nn.functional.interpolate's `size` needs to be dynamic. def get_rel_pos(q_size, k_size, rel_pos): """ Get relative positional embeddings according to the relative positions of query and key sizes.
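The `window_partition` hunk below applies the same trace-friendliness idea: a data-dependent `if pad_height > 0 or pad_width > 0:` branch is replaced by an unconditional `nn.functional.pad`, which is a no-op when both pads are zero. A self-contained sketch of that pattern (toy shapes, not the ViTDet code itself):

```python
import torch
import torch.nn.functional as F

def pad_to_window_multiple(hidden_state: torch.Tensor, window_size: int) -> torch.Tensor:
    # torch.jit.trace records a single execution path, so a Python branch taken
    # (or skipped) for the example input would be frozen into the traced graph.
    # Padding unconditionally keeps the graph valid for every input size.
    _, height, width, _ = hidden_state.shape
    pad_height = (window_size - height % window_size) % window_size
    pad_width = (window_size - width % window_size) % window_size
    # Pad order is (channels, width, height), innermost dimension first.
    return F.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height))

print(pad_to_window_multiple(torch.randn(1, 14, 14, 8), window_size=4).shape)
# torch.Size([1, 16, 16, 8])
```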
@@ -399,21 +401,23 @@ def window_partition(hidden_state, window_size): Returns: `tuple(torch.FloatTensor)` comprising various elements: - windows: windows after partition with [batch_size * num_windows, window_size, window_size, num_channels]. - - (patch_height, patch_width): padded height and width before partition + - (padded_height, padded_width): padded height and width before partition """ batch_size, height, width, num_channels = hidden_state.shape pad_height = (window_size - height % window_size) % window_size pad_width = (window_size - width % window_size) % window_size - if pad_height > 0 or pad_width > 0: - hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height)) - patch_height, patch_width = height + pad_height, width + pad_width + + # Noop in case pad_width == 0 and pad_height == 0. + hidden_state = nn.functional.pad(hidden_state, (0, 0, 0, pad_width, 0, pad_height)) + + padded_height, padded_width = height + pad_height, width + pad_width hidden_state = hidden_state.view( - batch_size, patch_height // window_size, window_size, patch_width // window_size, window_size, num_channels + batch_size, padded_height // window_size, window_size, padded_width // window_size, window_size, num_channels ) windows = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, num_channels) - return windows, (patch_height, patch_width) + return windows, (padded_height, padded_width) def window_unpartition(windows, window_size, pad_height_width, height_width): @@ -426,23 +430,24 @@ def window_unpartition(windows, window_size, pad_height_width, height_width): window_size (`int`): Window size. pad_height_width (`Tuple[int]`): - Padded height and width (patch_height, patch_width). + Padded height and width (padded_height, padded_width). height_width (`Tuple[int]`): Original height and width before padding. Returns: hidden_state: unpartitioned sequences with [batch_size, height, width, num_channels]. 
""" - patch_height, patch_width = pad_height_width + padded_height, padded_width = pad_height_width height, width = height_width - batch_size = windows.shape[0] // (patch_height * patch_width // window_size // window_size) + batch_size = windows.shape[0] // (padded_height * padded_width // window_size // window_size) hidden_state = windows.view( - batch_size, patch_height // window_size, patch_width // window_size, window_size, window_size, -1 + batch_size, padded_height // window_size, padded_width // window_size, window_size, window_size, -1 ) - hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch_size, patch_height, patch_width, -1) + hidden_state = hidden_state.permute(0, 1, 3, 2, 4, 5).contiguous() + hidden_state = hidden_state.view(batch_size, padded_height, padded_width, -1) - if patch_height > height or patch_width > width: - hidden_state = hidden_state[:, :height, :width, :].contiguous() + # We always have height <= padded_height and width <= padded_width + hidden_state = hidden_state[:, :height, :width, :].contiguous() return hidden_state From 7afade208695f1a25dcc9656d553d51ace67264e Mon Sep 17 00:00:00 2001 From: DrAnaximandre Date: Mon, 8 Apr 2024 12:45:21 -0400 Subject: [PATCH 051/110] Fix typo at ImportError (#30090) fix typo at ImportError --- src/transformers/quantizers/quantizer_quanto.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/transformers/quantizers/quantizer_quanto.py b/src/transformers/quantizers/quantizer_quanto.py index 7285124950..e7e2219ab6 100644 --- a/src/transformers/quantizers/quantizer_quanto.py +++ b/src/transformers/quantizers/quantizer_quanto.py @@ -60,7 +60,9 @@ class QuantoHfQuantizer(HfQuantizer): if not is_quanto_available(): raise ImportError("Loading a quanto quantized model requires quanto library (`pip install quanto`)") if not is_accelerate_available(): - raise ImportError("Loading a quanto quantized model requires accelerate library (`pip install quanto`)") + raise ImportError( + "Loading a quanto quantized model requires accelerate library (`pip install accelerate`)" + ) def update_device_map(self, device_map): if device_map is None: From 8c00b53eb01051e93942e99912455883b7ea47b2 Mon Sep 17 00:00:00 2001 From: Felix Hirwa Nshuti Date: Mon, 8 Apr 2024 22:37:30 +0530 Subject: [PATCH 052/110] Adding `mps` as device for `Pipeline` class (#30080) * adding env variable for mps and is_torch_mps_available for Pipeline * fix linting errors * Remove environment overide Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/pipelines/base.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index fa1f2fcf5d..7225a6136e 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -42,6 +42,7 @@ from ..utils import ( is_torch_available, is_torch_cuda_available, is_torch_mlu_available, + is_torch_mps_available, is_torch_npu_available, is_torch_xpu_available, logging, @@ -860,6 +861,8 @@ class Pipeline(_ScikitCompat): self.device = torch.device(f"npu:{device}") elif is_torch_xpu_available(check_device=True): self.device = torch.device(f"xpu:{device}") + elif is_torch_mps_available(): + self.device = torch.device(f"mps:{device}") else: raise ValueError(f"{device} unrecognized or not available.") else: From 2f12e408225b1ebceb0d2f701ce419d46678dc31 Mon Sep 17 00:00:00 2001 From: 
Jonathan Tow <41410219+jon-tow@users.noreply.github.com> Date: Mon, 8 Apr 2024 17:51:58 -0400 Subject: [PATCH 053/110] [`StableLm`] Add QK normalization and Parallel Residual Support (#29745) * init: add StableLm 2 support * add integration test for parallel residual and qk layernorm * update(modeling): match qk norm naming for consistency with phi/persimmon * fix(tests): run fwd/bwd on random init test model to jitter norm weights off identity * `use_parallel_residual`: add copy pointer to `GPTNeoXLayer.forward` * refactor: rename head states var in `StableLmLayerNormPerHead` * tests: update test model and add generate check --- .../models/stablelm/configuration_stablelm.py | 9 +++ .../models/stablelm/modeling_stablelm.py | 64 ++++++++++++++++--- .../models/stablelm/test_modeling_stablelm.py | 34 ++++++++++ 3 files changed, 97 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/stablelm/configuration_stablelm.py b/src/transformers/models/stablelm/configuration_stablelm.py index f1e3ab4517..beb4af4d84 100644 --- a/src/transformers/models/stablelm/configuration_stablelm.py +++ b/src/transformers/models/stablelm/configuration_stablelm.py @@ -83,6 +83,11 @@ class StableLmConfig(PretrainedConfig): is an experimental feature, subject to breaking API changes in future versions. use_qkv_bias (`bool`, *optional*, defaults to `False`): Whether or not the model should use bias for qkv layers. + qk_layernorm (`bool`, *optional*, defaults to `False`): + Whether or not to normalize, per head, the Queries and Keys after projecting the hidden states. + use_parallel_residual (`bool`, *optional*, defaults to `False`): + Whether to use a "parallel" formulation in each Transformer layer, which can provide a slight training + speedup at large scales. hidden_dropout (`float`, *optional*, defaults to 0.0): The dropout ratio after applying the MLP to the hidden states. 
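A toy sketch of the two residual layouts the new `use_parallel_residual` flag selects between, matching the decoder-layer change further below; the `nn.Linear` modules stand in for the real attention and MLP sublayers:

```python
import torch
from torch import nn

hidden_size = 8
attn, mlp = nn.Linear(hidden_size, hidden_size), nn.Linear(hidden_size, hidden_size)
ln1, ln2 = nn.LayerNorm(hidden_size), nn.LayerNorm(hidden_size)
x = torch.randn(2, 4, hidden_size)

# Sequential residual (default): x = x + attn(ln1(x)); x = x + mlp(ln2(x))
h = x + attn(ln1(x))
sequential_out = h + mlp(ln2(h))

# Parallel residual (GPT-NeoX style): both sublayers read the same normalized
# input, so the post-attention LayerNorm is dropped and the two branches can
# be computed concurrently: x = x + attn(ln1(x)) + mlp(ln1(x))
n = ln1(x)
parallel_out = x + attn(n) + mlp(n)
```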
attention_dropout (`float`, *optional*, defaults to 0.0): @@ -123,6 +128,8 @@ class StableLmConfig(PretrainedConfig): rope_theta=10_000, rope_scaling=None, use_qkv_bias=False, + qk_layernorm=False, + use_parallel_residual=False, hidden_dropout=0.0, attention_dropout=0.0, partial_rotary_factor=0.25, @@ -146,6 +153,8 @@ class StableLmConfig(PretrainedConfig): self.rope_theta = rope_theta self.rope_scaling = rope_scaling self.use_qkv_bias = use_qkv_bias + self.qk_layernorm = qk_layernorm + self.use_parallel_residual = use_parallel_residual self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout self.partial_rotary_factor = partial_rotary_factor diff --git a/src/transformers/models/stablelm/modeling_stablelm.py b/src/transformers/models/stablelm/modeling_stablelm.py index 76aca7bae9..3262f2cd3c 100755 --- a/src/transformers/models/stablelm/modeling_stablelm.py +++ b/src/transformers/models/stablelm/modeling_stablelm.py @@ -203,6 +203,21 @@ class StableLmMLP(nn.Module): return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) +class StableLmLayerNormPerHead(nn.Module): + def __init__(self, dim, num_heads, eps=1e-5, bias=False): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.norms = nn.ModuleList([nn.LayerNorm(dim, eps=eps, bias=bias) for _ in range(self.num_heads)]) + + def forward(self, hidden_states: torch.Tensor): + # Split along the num_heads axis to get per-head inputs + # [batch_size, num_heads, seq_len, head_dim] -> [batch_size, 1, seq_len, head_dim] * num_heads + states_per_heads = torch.split(hidden_states, 1, dim=1) + # Normalize and merge the heads back together + return torch.cat([norm(hidden_states) for norm, hidden_states in zip(self.norms, states_per_heads)], dim=1) + + # Copied from transformers.models.llama.modeling_llama.repeat_kv def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: """ @@ -250,6 +265,13 @@ class StableLmAttention(nn.Module): self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.use_qkv_bias) self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.qk_layernorm = config.qk_layernorm + if self.qk_layernorm: + self.q_layernorm = StableLmLayerNormPerHead(self.head_dim, self.num_heads, eps=config.layer_norm_eps) + self.k_layernorm = StableLmLayerNormPerHead( + self.head_dim, self.num_key_value_heads, eps=config.layer_norm_eps + ) + self.attention_dropout = nn.Dropout(config.attention_dropout) self._init_rope() @@ -300,6 +322,10 @@ class StableLmAttention(nn.Module): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: @@ -409,6 +435,10 @@ class StableLmSdpaAttention(StableLmAttention): key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: @@ -513,6 +543,10 @@ class StableLmFlashAttention2(StableLmAttention): key_states 
= key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + if self.qk_layernorm: + query_states = self.q_layernorm(query_states) + key_states = self.k_layernorm(key_states) + kv_seq_len = key_states.shape[-2] if past_key_value is not None: if self.layer_idx is None: @@ -678,11 +712,14 @@ ATTENTION_CLASSES = { class StableLmDecoderLayer(nn.Module): def __init__(self, config: StableLmConfig, layer_idx: int): super().__init__() + self.use_parallel_residual = config.use_parallel_residual self.hidden_size = config.hidden_size self.self_attn = ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) self.mlp = StableLmMLP(config) self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_attention_layernorm = None + if not self.use_parallel_residual: + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.dropout = nn.Dropout(config.hidden_dropout) def forward( @@ -719,7 +756,7 @@ class StableLmDecoderLayer(nn.Module): hidden_states = self.input_layernorm(hidden_states) # Self Attention - hidden_states, self_attn_weights, present_key_value = self.self_attn( + self_attn_output, self_attn_weights, present_key_value = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, position_ids=position_ids, @@ -727,15 +764,22 @@ class StableLmDecoderLayer(nn.Module): output_attentions=output_attentions, use_cache=use_cache, ) - hidden_states = residual + hidden_states - # Fully Connected - residual = hidden_states - hidden_states = self.post_attention_layernorm(hidden_states) - hidden_states = self.mlp(hidden_states) - - hidden_states = self.dropout(hidden_states) - hidden_states = hidden_states + residual + # copied from transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXLayer.forward + if self.use_parallel_residual: + # x = x + attn(ln1(x)) + mlp(ln1(x)) + # Fully Connected + mlp_output = self.mlp(hidden_states) + mlp_output = self.dropout(mlp_output) + hidden_states = residual + self_attn_output + mlp_output + else: + # x = x + attn(ln1(x)) + # x = x + mlp(ln2(x)) + residual = residual + self_attn_output + # Fully Connected + mlp_output = self.mlp(self.post_attention_layernorm(residual)) + mlp_output = self.dropout(mlp_output) + hidden_states = residual + mlp_output outputs = (hidden_states,) diff --git a/tests/models/stablelm/test_modeling_stablelm.py b/tests/models/stablelm/test_modeling_stablelm.py index 0f01fa8fca..f5e74ead9b 100644 --- a/tests/models/stablelm/test_modeling_stablelm.py +++ b/tests/models/stablelm/test_modeling_stablelm.py @@ -483,6 +483,40 @@ class StableLmModelIntegrationTest(unittest.TestCase): EXPECTED_TEXT_COMPLETION = """My favorite food has always been pizza, but lately I’ve been craving something different. 
I’ve been trying to eat healthier and I’ve""" self.assertEqual(text, EXPECTED_TEXT_COMPLETION) + @slow + def test_model_tiny_random_stablelm_2_logits(self): + # Check parallel residual and qk layernorm forward pass + input_ids = {"input_ids": torch.tensor([[510, 8588, 310, 1900, 9386]], dtype=torch.long, device=torch_device)} + + model = StableLmForCausalLM.from_pretrained("stabilityai/tiny-random-stablelm-2").to(torch_device) + model.eval() + + output = model(**input_ids).logits + + # Expected mean on dim = -1 + EXPECTED_MEAN = torch.tensor([[-2.7196, -3.6099, -2.6877, -3.1973, -3.9344]]).to(torch_device) + self.assertTrue(torch.allclose(output.mean(dim=-1), EXPECTED_MEAN, atol=1e-4, rtol=1e-4)) + + # Expected logits sliced from [0, 0, 0:30] + EXPECTED_SLICE = torch.tensor([2.8364, 5.3811, 5.1659, 7.5485, 4.3219, 6.3315, 1.3967, 6.9147, 3.9679, 6.4786, 5.9176, 3.3067, 5.2917, 0.1485, 3.9630, 7.9947,10.6727, 9.6757, 8.8772, 8.3527, 7.8445, 6.6025, 5.5786, 7.0985,6.1369, 3.4259, 1.9397, 4.6157, 4.8105, 3.1768]).to(torch_device) # fmt: skip + self.assertTrue(torch.allclose(output[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4)) + + @slow + def test_model_tiny_random_stablelm_2_generation(self): + # Check parallel residual and qk layernorm generation + tokenizer = AutoTokenizer.from_pretrained("stabilityai/tiny-random-stablelm-2") + model = StableLmForCausalLM.from_pretrained("stabilityai/tiny-random-stablelm-2") + input_ids = tokenizer.encode( + "My favorite ride at the amusement park", + return_tensors="pt", + ) + + outputs = model.generate(input_ids, max_new_tokens=20, temperature=0) + text = tokenizer.decode(outputs[0], skip_special_tokens=True) + + EXPECTED_TEXT_COMPLETION = """My favorite ride at the amusement park is the 2000-mile roller coaster. 
It's a thrilling ride filled with roller coast""" + self.assertEqual(text, EXPECTED_TEXT_COMPLETION) + @require_bitsandbytes @slow @require_flash_attn From 4e3490f79b40248c53ee54365a9662611e880892 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Tue, 9 Apr 2024 12:01:47 +0530 Subject: [PATCH 054/110] Fix failing DeepSpeed model zoo tests (#30112) * fix sequence length errors * fix label column name error for vit * fix the lm_head embedding!=linear layer mismatches for Seq2Seq models --- src/transformers/modeling_utils.py | 5 ++++- tests/deepspeed/test_model_zoo.py | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index fd0afa521a..9f22333839 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1932,7 +1932,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix # if word embeddings are not tied, make sure that lm head is resized as well if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: old_lm_head = self.get_output_embeddings() - new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens) + if isinstance(old_lm_head, torch.nn.Embedding): + new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens) + else: + new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens) if hasattr(old_lm_head, "_hf_hook"): hook = old_lm_head._hf_hook add_hook_to_module(new_lm_head, hook) diff --git a/tests/deepspeed/test_model_zoo.py b/tests/deepspeed/test_model_zoo.py index 08c8b86dc0..ea002f5ddf 100644 --- a/tests/deepspeed/test_model_zoo.py +++ b/tests/deepspeed/test_model_zoo.py @@ -236,6 +236,8 @@ def make_task_cmds(): --train_file {data_dir_wmt}/train.json --source_lang en --target_lang ro + --max_source_length 12 + --max_target_length 12 """, "sum": f""" {scripts_dir}/summarization/run_summarization.py @@ -269,6 +271,7 @@ def make_task_cmds(): --remove_unused_columns False --max_steps 10 --image_processor_name {DS_TESTS_DIRECTORY}/vit_feature_extractor.json + --label_column_name labels """, } From af4c02622bfb4521367c459c6743014ef9be788d Mon Sep 17 00:00:00 2001 From: Matthew Hoffman Date: Tue, 9 Apr 2024 03:26:15 -0500 Subject: [PATCH 055/110] Add datasets.Dataset to Trainer's train_dataset and eval_dataset type hints (#30077) * Add datasets.Dataset to Trainer's train_dataset and eval_dataset type hints * Add is_datasets_available check for importing datasets under TYPE_CHECKING guard https://github.com/huggingface/transformers/pull/30077/files#r1555939352 --- src/transformers/trainer.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 227e92fa63..1cdd8623e5 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -250,6 +250,8 @@ def _get_fsdp_ckpt_kwargs(): if TYPE_CHECKING: import optuna + if is_datasets_available(): + import datasets logger = logging.get_logger(__name__) @@ -287,7 +289,7 @@ class Trainer: The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will default to [`default_data_collator`] if no `tokenizer` is provided, an instance of [`DataCollatorWithPadding`] otherwise. 
- train_dataset (`torch.utils.data.Dataset` or `torch.utils.data.IterableDataset`, *optional*): + train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*): The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. @@ -296,7 +298,7 @@ `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally sets the seed of the RNGs used. - eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`]), *optional*): + eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`, `datasets.Dataset`]), *optional*): The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each dataset prepending the dictionary key to the metric name. @@ -358,8 +360,8 @@ model: Union[PreTrainedModel, nn.Module] = None, args: TrainingArguments = None, data_collator: Optional[DataCollator] = None, - train_dataset: Optional[Union[Dataset, IterableDataset]] = None, - eval_dataset: Optional[Union[Dataset, Dict[str, Dataset]]] = None, + train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, + eval_dataset: Optional[Union[Dataset, Dict[str, Dataset], "datasets.Dataset"]] = None, tokenizer: Optional[PreTrainedTokenizerBase] = None, model_init: Optional[Callable[[], PreTrainedModel]] = None, compute_metrics: Optional[Callable[[EvalPrediction], Dict]] = None, From 841e87ef4f9ab52a2c5db62789bb625291be7f11 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Tue, 9 Apr 2024 14:58:02 +0500 Subject: [PATCH 056/110] Fix docs Pop2Piano (#30140) fix copies --- .../models/pop2piano/tokenization_pop2piano.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/transformers/models/pop2piano/tokenization_pop2piano.py b/src/transformers/models/pop2piano/tokenization_pop2piano.py index 3c5844ae7c..5ad0996c15 100644 --- a/src/transformers/models/pop2piano/tokenization_pop2piano.py +++ b/src/transformers/models/pop2piano/tokenization_pop2piano.py @@ -73,6 +73,16 @@ class Pop2PianoTokenizer(PreTrainedTokenizer): Determines the default velocity to be used while creating midi Notes. num_bars (`int`, *optional*, defaults to 2): Determines cutoff_time_idx in for each token. + unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"-1"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to 1): + The end of sequence token. + pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to 0): + A special token used to make arrays of tokens the same size for batching purposes. Will then be ignored by + attention mechanisms or loss computation. + bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to 2): + The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
""" model_input_names = ["token_ids", "attention_mask"] From ec59a4219254e0c7efe3ec9001c7a05efdeec010 Mon Sep 17 00:00:00 2001 From: Matt Date: Tue, 9 Apr 2024 11:04:18 +0100 Subject: [PATCH 057/110] Revert workaround for TF safetensors loading (#30128) * See if we can get tests to pass with the fixed weights * See if we can get tests to pass with the fixed weights * Replace the revisions now that we don't need them anymore --- tests/generation/test_framework_agnostic.py | 8 ++++---- tests/generation/test_tf_utils.py | 2 +- tests/pipelines/test_pipelines_text_generation.py | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/generation/test_framework_agnostic.py b/tests/generation/test_framework_agnostic.py index 85a58bdf28..f4f13dd8d5 100644 --- a/tests/generation/test_framework_agnostic.py +++ b/tests/generation/test_framework_agnostic.py @@ -111,7 +111,7 @@ class GenerationIntegrationTestsMixin: article = """Justin Timberlake.""" gpt2_tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") - gpt2_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) + gpt2_model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") input_ids = gpt2_tokenizer(article, return_tensors=return_tensors).input_ids if is_pt: gpt2_model = gpt2_model.to(torch_device) @@ -582,7 +582,7 @@ class GenerationIntegrationTestsMixin: tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) + model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") if is_pt: model = model.to(torch_device) tokens = tokens.to(torch_device) @@ -611,7 +611,7 @@ class GenerationIntegrationTestsMixin: tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) + model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") if is_pt: model = model.to(torch_device) tokens = tokens.to(torch_device) @@ -638,7 +638,7 @@ class GenerationIntegrationTestsMixin: tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors=return_tensors) - model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=is_pt) + model = model_cls.from_pretrained("hf-internal-testing/tiny-random-gpt2") if is_pt: model = model.to(torch_device) tokens = tokens.to(torch_device) diff --git a/tests/generation/test_tf_utils.py b/tests/generation/test_tf_utils.py index 73eba05ece..f40ceebef7 100644 --- a/tests/generation/test_tf_utils.py +++ b/tests/generation/test_tf_utils.py @@ -194,7 +194,7 @@ class TFGenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTests tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") text = """Hello, my dog is cute and""" tokens = tokenizer(text, return_tensors="tf") - model = TFAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2", use_safetensors=False) + model = TFAutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2") eos_token_id = 638 # forces the generation to happen on CPU, to 
avoid GPU-related quirks diff --git a/tests/pipelines/test_pipelines_text_generation.py b/tests/pipelines/test_pipelines_text_generation.py index 318526b850..ada04c7dbe 100644 --- a/tests/pipelines/test_pipelines_text_generation.py +++ b/tests/pipelines/test_pipelines_text_generation.py @@ -268,7 +268,6 @@ class TextGenerationPipelineTests(unittest.TestCase): text_generator = TextGenerationPipeline(model=model, tokenizer=tokenizer) return text_generator, ["This is a test", "Another test"] - @require_torch # See https://github.com/huggingface/transformers/issues/30117 def test_stop_sequence_stopping_criteria(self): prompt = """Hello I believe in""" text_generator = pipeline("text-generation", model="hf-internal-testing/tiny-random-gpt2") From ba1b24e07bebc8e36b464bf7a403feb4f3ccb807 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 9 Apr 2024 12:52:50 +0200 Subject: [PATCH 058/110] [Trainer] Fix default data collator (#30142) * Fix data collator * Support feature extractors as well --- src/transformers/trainer.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 1cdd8623e5..844e464c6e 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -58,6 +58,7 @@ from . import __version__ from .configuration_utils import PretrainedConfig from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator from .debug_utils import DebugOption, DebugUnderflowOverflow +from .feature_extraction_sequence_utils import SequenceFeatureExtractor from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend from .image_processing_utils import BaseImageProcessor from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available @@ -492,7 +493,11 @@ class Trainer: ): self.place_model_on_device = False - default_collator = DataCollatorWithPadding(tokenizer) if tokenizer is not None else default_data_collator + default_collator = ( + DataCollatorWithPadding(tokenizer) + if tokenizer is not None and isinstance(tokenizer, (PreTrainedTokenizerBase, SequenceFeatureExtractor)) + else default_data_collator + ) self.data_collator = data_collator if data_collator is not None else default_collator self.train_dataset = train_dataset self.eval_dataset = eval_dataset From e9c23fa056f401a586a1691edf773d1b9b60f96d Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Tue, 9 Apr 2024 12:55:42 +0200 Subject: [PATCH 059/110] [Trainer] Undo #29896 (#30129) * Undo * Use tokenizer * Undo data collator --- docs/source/en/tasks/image_classification.md | 4 ++-- docs/source/en/tasks/object_detection.md | 2 +- docs/source/en/tasks/semantic_segmentation.md | 2 +- docs/source/en/tasks/video_classification.md | 2 +- docs/source/es/tasks/image_classification.md | 2 +- docs/source/ja/tasks/image_classification.md | 4 ++-- docs/source/ja/tasks/object_detection.md | 2 +- docs/source/ja/tasks/semantic_segmentation.md | 2 +- docs/source/ja/tasks/video_classification.md | 2 +- docs/source/ko/tasks/image_classification.md | 4 ++-- docs/source/ko/tasks/object_detection.md | 2 +- docs/source/ko/tasks/semantic_segmentation.md | 2 +- docs/source/ko/tasks/video_classification.md | 2 +- .../run_image_classification.py | 2 +- examples/pytorch/image-pretraining/run_mae.py | 2 +- examples/pytorch/image-pretraining/run_mim.py | 2 +- .../run_semantic_segmentation.py | 2 
+- .../run_image_classification.py | 2 +- src/transformers/trainer.py | 17 ++--------------- src/transformers/trainer_callback.py | 6 +----- 20 files changed, 24 insertions(+), 41 deletions(-) diff --git a/docs/source/en/tasks/image_classification.md b/docs/source/en/tasks/image_classification.md index f54b4ed025..30c517f3be 100644 --- a/docs/source/en/tasks/image_classification.md +++ b/docs/source/en/tasks/image_classification.md @@ -322,7 +322,7 @@ At this point, only three steps remain: ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... image_processor=image_processor, +... tokenizer=image_processor, ... compute_metrics=compute_metrics, ... ) @@ -418,7 +418,7 @@ and use the [PushToHubCallback](../main_classes/keras_callbacks#transformers.Pus >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) >>> push_to_hub_callback = PushToHubCallback( ... output_dir="food_classifier", -... image_processor=image_processor, +... tokenizer=image_processor, ... save_strategy="no", ... ) >>> callbacks = [metric_callback, push_to_hub_callback] diff --git a/docs/source/en/tasks/object_detection.md b/docs/source/en/tasks/object_detection.md index 56d46e4aa5..2513591f54 100644 --- a/docs/source/en/tasks/object_detection.md +++ b/docs/source/en/tasks/object_detection.md @@ -384,7 +384,7 @@ Finally, bring everything together, and call [`~transformers.Trainer.train`]: ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... image_processor=image_processor, +... tokenizer=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md index ba40ccba1e..e99499bbbb 100644 --- a/docs/source/en/tasks/semantic_segmentation.md +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -642,7 +642,7 @@ and use the [`PushToHubCallback`] to upload the model: ... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] ... ) ->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor) +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) >>> callbacks = [metric_callback, push_to_hub_callback] ``` diff --git a/docs/source/en/tasks/video_classification.md b/docs/source/en/tasks/video_classification.md index a0f0a695f7..38bdceba41 100644 --- a/docs/source/en/tasks/video_classification.md +++ b/docs/source/en/tasks/video_classification.md @@ -407,7 +407,7 @@ Then you just pass all of this along with the datasets to `Trainer`: ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... image_processor=image_processor, +... tokenizer=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) diff --git a/docs/source/es/tasks/image_classification.md b/docs/source/es/tasks/image_classification.md index 4a572d8169..f09730caf6 100644 --- a/docs/source/es/tasks/image_classification.md +++ b/docs/source/es/tasks/image_classification.md @@ -160,7 +160,7 @@ Al llegar a este punto, solo quedan tres pasos: ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... image_processor=image_processor, +... tokenizer=image_processor, ... 
) >>> trainer.train() diff --git a/docs/source/ja/tasks/image_classification.md b/docs/source/ja/tasks/image_classification.md index fc57cf4dfb..f8d8d0d552 100644 --- a/docs/source/ja/tasks/image_classification.md +++ b/docs/source/ja/tasks/image_classification.md @@ -328,7 +328,7 @@ food["test"].set_transform(preprocess_val) ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... image_processor=image_processor, +... tokenizer=image_processor, ... compute_metrics=compute_metrics, ... ) @@ -426,7 +426,7 @@ Convert your datasets to the `tf.data.Dataset` format using the [`~datasets.Data >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) >>> push_to_hub_callback = PushToHubCallback( ... output_dir="food_classifier", -... image_processor=image_processor, +... tokenizer=image_processor, ... save_strategy="no", ... ) >>> callbacks = [metric_callback, push_to_hub_callback] diff --git a/docs/source/ja/tasks/object_detection.md b/docs/source/ja/tasks/object_detection.md index e90cb4645a..389e7bdf2f 100644 --- a/docs/source/ja/tasks/object_detection.md +++ b/docs/source/ja/tasks/object_detection.md @@ -376,7 +376,7 @@ DETR モデルをトレーニングできる「ラベル」。画像プロセッ ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... image_processor=image_processor, +... tokenizer=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/ja/tasks/semantic_segmentation.md b/docs/source/ja/tasks/semantic_segmentation.md index bc4c8fdc10..2816688b4e 100644 --- a/docs/source/ja/tasks/semantic_segmentation.md +++ b/docs/source/ja/tasks/semantic_segmentation.md @@ -434,7 +434,7 @@ TensorFlow でモデルを微調整するには、次の手順に従います。 ... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] ... ) ->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor) +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) >>> callbacks = [metric_callback, push_to_hub_callback] ``` diff --git a/docs/source/ja/tasks/video_classification.md b/docs/source/ja/tasks/video_classification.md index b0b5139028..e0c3836194 100644 --- a/docs/source/ja/tasks/video_classification.md +++ b/docs/source/ja/tasks/video_classification.md @@ -414,7 +414,7 @@ def compute_metrics(eval_pred): ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... image_processor=image_processor, +... tokenizer=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) diff --git a/docs/source/ko/tasks/image_classification.md b/docs/source/ko/tasks/image_classification.md index 055100d4c0..031e01ea5c 100644 --- a/docs/source/ko/tasks/image_classification.md +++ b/docs/source/ko/tasks/image_classification.md @@ -321,7 +321,7 @@ food["test"].set_transform(preprocess_val) ... data_collator=data_collator, ... train_dataset=food["train"], ... eval_dataset=food["test"], -... image_processor=image_processor, +... tokenizer=image_processor, ... compute_metrics=compute_metrics, ... ) @@ -417,7 +417,7 @@ TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요: >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_eval_dataset) >>> push_to_hub_callback = PushToHubCallback( ... output_dir="food_classifier", -... image_processor=image_processor, +... tokenizer=image_processor, ... save_strategy="no", ... 
) >>> callbacks = [metric_callback, push_to_hub_callback] diff --git a/docs/source/ko/tasks/object_detection.md b/docs/source/ko/tasks/object_detection.md index 1eeada9a50..0076bba6f8 100644 --- a/docs/source/ko/tasks/object_detection.md +++ b/docs/source/ko/tasks/object_detection.md @@ -366,7 +366,7 @@ DatasetDict({ ... args=training_args, ... data_collator=collate_fn, ... train_dataset=cppe5["train"], -... image_processor=image_processor, +... tokenizer=image_processor, ... ) >>> trainer.train() diff --git a/docs/source/ko/tasks/semantic_segmentation.md b/docs/source/ko/tasks/semantic_segmentation.md index 4c23b2ad80..4b6109d692 100644 --- a/docs/source/ko/tasks/semantic_segmentation.md +++ b/docs/source/ko/tasks/semantic_segmentation.md @@ -424,7 +424,7 @@ TensorFlow에서 모델을 미세 조정하려면 다음 단계를 따르세요: ... metric_fn=compute_metrics, eval_dataset=tf_eval_dataset, batch_size=batch_size, label_cols=["labels"] ... ) ->>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", image_processor=image_processor) +>>> push_to_hub_callback = PushToHubCallback(output_dir="scene_segmentation", tokenizer=image_processor) >>> callbacks = [metric_callback, push_to_hub_callback] ``` diff --git a/docs/source/ko/tasks/video_classification.md b/docs/source/ko/tasks/video_classification.md index 4d13f9ac61..01dbb0757b 100644 --- a/docs/source/ko/tasks/video_classification.md +++ b/docs/source/ko/tasks/video_classification.md @@ -411,7 +411,7 @@ def compute_metrics(eval_pred): ... args, ... train_dataset=train_dataset, ... eval_dataset=val_dataset, -... image_processor=image_processor, +... tokenizer=image_processor, ... compute_metrics=compute_metrics, ... data_collator=collate_fn, ... ) diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py index 1c952e5601..ff01600cb3 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py @@ -411,7 +411,7 @@ def main(): train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, - image_processor=image_processor, + tokenizer=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/image-pretraining/run_mae.py b/examples/pytorch/image-pretraining/run_mae.py index 0f098caf02..a23e41df61 100644 --- a/examples/pytorch/image-pretraining/run_mae.py +++ b/examples/pytorch/image-pretraining/run_mae.py @@ -369,7 +369,7 @@ def main(): args=training_args, train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, - image_processor=image_processor, + tokenizer=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/image-pretraining/run_mim.py b/examples/pytorch/image-pretraining/run_mim.py index e1afeece12..625a96f14e 100644 --- a/examples/pytorch/image-pretraining/run_mim.py +++ b/examples/pytorch/image-pretraining/run_mim.py @@ -458,7 +458,7 @@ def main(): args=training_args, train_dataset=ds["train"] if training_args.do_train else None, eval_dataset=ds["validation"] if training_args.do_eval else None, - image_processor=image_processor, + tokenizer=image_processor, data_collator=collate_fn, ) diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py index 8324531ccb..957b78b9b5 
100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py @@ -510,7 +510,7 @@ def main(): train_dataset=dataset["train"] if training_args.do_train else None, eval_dataset=dataset["validation"] if training_args.do_eval else None, compute_metrics=compute_metrics, - image_processor=image_processor, + tokenizer=image_processor, data_collator=default_data_collator, ) diff --git a/examples/tensorflow/image-classification/run_image_classification.py b/examples/tensorflow/image-classification/run_image_classification.py index ab2de73a3b..3e2b43bca1 100644 --- a/examples/tensorflow/image-classification/run_image_classification.py +++ b/examples/tensorflow/image-classification/run_image_classification.py @@ -552,7 +552,7 @@ def main(): output_dir=training_args.output_dir, hub_model_id=push_to_hub_model_id, hub_token=training_args.push_to_hub_token, - image_processor=image_processor, + tokenizer=image_processor, **model_card_kwargs, ) ) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 844e464c6e..f6e80ebafe 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -60,7 +60,6 @@ from .data.data_collator import DataCollator, DataCollatorWithPadding, default_d from .debug_utils import DebugOption, DebugUnderflowOverflow from .feature_extraction_sequence_utils import SequenceFeatureExtractor from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend -from .image_processing_utils import BaseImageProcessor from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available from .integrations.tpu import tpu_spmd_dataloader from .modelcard import TrainingSummary @@ -331,9 +330,6 @@ class Trainer: by this function will be reflected in the predictions received by `compute_metrics`. Note that the labels (second parameter) will be `None` if the dataset does not have them. - image_processor ([`BaseImageProcessor`], *optional*): - The image processor used to preprocess the data. If provided, it will be saved along the model to make it easier - to rerun an interrupted training or reuse the fine-tuned model. Important attributes: @@ -369,7 +365,6 @@ class Trainer: callbacks: Optional[List[TrainerCallback]] = None, optimizers: Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, - image_processor: Optional["BaseImageProcessor"] = None, ): if args is None: output_dir = "tmp_trainer" @@ -502,7 +497,6 @@ class Trainer: self.train_dataset = train_dataset self.eval_dataset = eval_dataset self.tokenizer = tokenizer - self.image_processor = image_processor # Bnb Quantized models doesn't support `.to` operation. 
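Taken together, the doc and example diffs in this patch make one substitution: the short-lived `image_processor` argument is removed and vision processors go back through `Trainer`'s existing `tokenizer` parameter, which is the hook that saves a processing class next to each checkpoint. A minimal sketch of the resulting user code — the checkpoint id and the omitted datasets are placeholders, not taken from the patch:

```python
from transformers import (
    AutoImageProcessor,
    AutoModelForImageClassification,
    Trainer,
    TrainingArguments,
)

checkpoint = "google/vit-base-patch16-224-in21k"  # placeholder model id
image_processor = AutoImageProcessor.from_pretrained(checkpoint)
model = AutoModelForImageClassification.from_pretrained(checkpoint)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="out"),
    # No `image_processor=` kwarg after this patch: the image processor is
    # passed as `tokenizer=` and saved alongside the model on checkpoint/push.
    tokenizer=image_processor,
)
```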
if ( @@ -554,7 +548,7 @@ class Trainer: default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks self.callback_handler = CallbackHandler( - callbacks, self.model, self.tokenizer, self.image_processor, self.optimizer, self.lr_scheduler + callbacks, self.model, self.tokenizer, self.optimizer, self.lr_scheduler ) self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK) @@ -3289,8 +3283,6 @@ class Trainer: ) if self.tokenizer is not None and self.args.should_save: self.tokenizer.save_pretrained(output_dir) - if self.image_processor is not None and self.args.should_save: - self.image_processor.save_pretrained(output_dir) # We moved the model from TPU -> CPU for saving the weights. # Now we should move it back to subsequent compute still works. @@ -3328,8 +3320,6 @@ class Trainer: if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) - if self.image_processor is not None: - self.image_processor.save_pretrained(output_dir) # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) @@ -4027,9 +4017,6 @@ class Trainer: # Saving the tokenizer is fast and we don't know how many files it may have spawned, so we resave it to be sure. if self.tokenizer is not None: self.tokenizer.save_pretrained(output_dir) - # Same for the image processor - if self.image_processor is not None: - self.image_processor.save_pretrained(output_dir) # Same for the training arguments torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) @@ -4083,7 +4070,7 @@ class Trainer: **kwargs, ) -> str: """ - Upload `self.model` and `self.tokenizer` or `self.image_processor` to the 🤗 model hub on the repo `self.args.hub_model_id`. + Upload `self.model` and `self.tokenizer` to the 🤗 model hub on the repo `self.args.hub_model_id`. Parameters: commit_message (`str`, *optional*, defaults to `"End of training"`): diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index a9cb6eca59..1e3b0e587a 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -189,8 +189,6 @@ class TrainerCallback: The model being trained. tokenizer ([`PreTrainedTokenizer`]): The tokenizer used for encoding the data. - image_processor ([`BaseImageProcessor`]): - The image processor used for encoding the images. optimizer (`torch.optim.Optimizer`): The optimizer used for the training steps. 
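For callback authors, the visible effect of the `CallbackHandler` change below is that the `image_processor` keyword disappears from callback events: whatever object was passed to `Trainer(tokenizer=...)` — a tokenizer or an image processor — arrives under the `tokenizer` key. A hedged sketch of a custom callback written against the post-patch interface (the class and its save-on-checkpoint behavior are illustrative, not part of the patch):

```python
from transformers import TrainerCallback


class SaveProcessorCallback(TrainerCallback):
    """Illustrative callback that persists the processing class on every save."""

    def on_save(self, args, state, control, **kwargs):
        # `image_processor` is no longer among the event kwargs; an image
        # processor given to `Trainer(tokenizer=...)` shows up as `tokenizer`.
        processor = kwargs.get("tokenizer")
        if processor is not None:
            processor.save_pretrained(args.output_dir)
```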
lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`): @@ -309,13 +307,12 @@ class TrainerCallback: class CallbackHandler(TrainerCallback): """Internal class that just calls the list of callbacks in order.""" - def __init__(self, callbacks, model, tokenizer, image_processor, optimizer, lr_scheduler): + def __init__(self, callbacks, model, tokenizer, optimizer, lr_scheduler): self.callbacks = [] for cb in callbacks: self.add_callback(cb) self.model = model self.tokenizer = tokenizer - self.image_processor = image_processor self.optimizer = optimizer self.lr_scheduler = lr_scheduler self.train_dataloader = None @@ -420,7 +417,6 @@ class CallbackHandler(TrainerCallback): control, model=self.model, tokenizer=self.tokenizer, - image_processor=self.image_processor, optimizer=self.optimizer, lr_scheduler=self.lr_scheduler, train_dataloader=self.train_dataloader, From 08a194fcd615dcf9406a7e319d637cc303097f46 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 9 Apr 2024 13:28:54 +0200 Subject: [PATCH 060/110] Fix slow tests for important models to be compatible with A10 runners (#29905) * fix mistral and mixtral * add pdb * fix mixtral tesst * fix * fix mistral ? * add fix gemma * fix mistral * fix * test * anoter test * fix * fix * fix mistral tests * fix them again * final fixes for mistral * fix padding right * fix whipser fa2 * fix * fix * fix gemma * test * fix llama * fix * fix * fix llama gemma * add class attribute * fix CI * clarify whisper * compute_capability * rename names in some comments * Add # fmt: skip * make style * Update tests/models/mistral/test_modeling_mistral.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * update * update --------- Co-authored-by: Younes Belkada Co-authored-by: ydshieh Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- tests/models/gemma/test_modeling_gemma.py | 141 ++++++++++-------- tests/models/llama/test_modeling_llama.py | 34 ++++- tests/models/mistral/test_modeling_mistral.py | 60 ++++++-- tests/models/mixtral/test_modeling_mixtral.py | 85 ++++++++--- tests/models/whisper/test_modeling_whisper.py | 18 +++ tests/test_modeling_common.py | 18 ++- 6 files changed, 246 insertions(+), 110 deletions(-) diff --git a/tests/models/gemma/test_modeling_gemma.py b/tests/models/gemma/test_modeling_gemma.py index 8c3aa392ba..e70dab3d95 100644 --- a/tests/models/gemma/test_modeling_gemma.py +++ b/tests/models/gemma/test_modeling_gemma.py @@ -21,6 +21,7 @@ from parameterized import parameterized from transformers import AutoModelForCausalLM, AutoTokenizer, GemmaConfig, is_torch_available from transformers.testing_utils import ( + is_flaky, require_bitsandbytes, require_flash_attn, require_read_token, @@ -379,40 +380,6 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi def test_past_key_values_format(self): pass - @require_flash_attn - @require_torch_gpu - @pytest.mark.flash_attn_test - @slow - def test_flash_attn_2_generate_padding_right(self): - import torch - - for model_class in self.all_generative_model_classes: - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to( - torch_device - ) - - dummy_input = torch.LongTensor([[0, 2, 3, 4], [0, 2, 3, 4]]).to(torch_device) - dummy_attention_mask = 
torch.LongTensor([[1, 1, 1, 1], [1, 1, 1, 0]]).to(torch_device) - - model.generate(dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False) - - model = model_class.from_pretrained( - tmpdirname, - torch_dtype=torch.float16, - attn_implementation="flash_attention_2", - low_cpu_mem_usage=True, - ).to(torch_device) - - with self.assertRaises(ValueError): - _ = model.generate( - dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=1, do_sample=False - ) - @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test @@ -500,6 +467,7 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi @require_flash_attn @require_torch_gpu @pytest.mark.flash_attn_test + @is_flaky @slow def test_flash_attn_2_equivalence(self): for model_class in self.all_model_classes: @@ -531,12 +499,21 @@ class GemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi assert torch.allclose(logits_fa, logits, atol=3e-3) -@require_torch_gpu @slow -@require_read_token +@require_torch_gpu class GemmaIntegrationTest(unittest.TestCase): input_text = ["Hello I am doing", "Hi today"] + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + @require_read_token def test_model_2b_fp32(self): model_id = "google/gemma-2b" EXPECTED_TEXTS = [ @@ -554,6 +531,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) + @require_read_token def test_model_2b_fp16(self): model_id = "google/gemma-2b" EXPECTED_TEXTS = [ @@ -573,6 +551,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) + @require_read_token def test_model_2b_fp16_static_cache(self): model_id = "google/gemma-2b" EXPECTED_TEXTS = [ @@ -594,12 +573,19 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) + @require_read_token def test_model_2b_bf16(self): model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Khichdi", - ] + EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ], + 8: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + } model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( torch_device @@ -611,14 +597,21 @@ class GemmaIntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS) + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + @require_read_token def test_model_2b_eager(self): model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I am 
looking for some information on the ", - "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", - ] + EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project on the 1990s and I am looking for some information on the ", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + 8: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + } model = AutoModelForCausalLM.from_pretrained( model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="eager" @@ -631,15 +624,22 @@ class GemmaIntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS) + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) @require_torch_sdpa + @require_read_token def test_model_2b_sdpa(self): model_id = "google/gemma-2b" - EXPECTED_TEXTS = [ - "Hello I am doing a project on the 1990s and I need to know what the most popular music", - "Hi today I am going to share with you a very easy and simple recipe of Khichdi", - ] + EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Khichdi", + ], + 8: [ + "Hello I am doing a project on the 1990s and I need to know what the most popular music", + "Hi today I am going to share with you a very easy and simple recipe of Kaju Kat", + ], + } model = AutoModelForCausalLM.from_pretrained( model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16, attn_implementation="sdpa" @@ -652,10 +652,11 @@ class GemmaIntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS) + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) @pytest.mark.flash_attn_test @require_flash_attn + @require_read_token def test_model_2b_flash_attn(self): model_id = "google/gemma-2b" EXPECTED_TEXTS = [ @@ -677,6 +678,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) @require_bitsandbytes + @require_read_token def test_model_2b_4bit(self): model_id = "google/gemma-2b" EXPECTED_TEXTS = [ @@ -695,6 +697,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) @unittest.skip("The test will not fit our CI runners") + @require_read_token def test_model_7b_fp32(self): model_id = "google/gemma-7b" EXPECTED_TEXTS = [ @@ -712,6 +715,7 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) + @require_read_token def test_model_7b_fp16(self): model_id = "google/gemma-7b" EXPECTED_TEXTS = [ @@ -731,12 +735,19 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) + @require_read_token def test_model_7b_bf16(self): model_id = "google/gemma-7b" - EXPECTED_TEXTS = [ - """Hello I am doing a project on a 1991 240sx and I am trying to find""", - "Hi today I am going to show you how to make a very simple and easy to make a very simple and", - ] + EXPECTED_TEXTS = { + 7: [ + """Hello I am doing a project on a 1991 240sx and I 
am trying to find""", + "Hi today I am going to show you how to make a very simple and easy to make a very simple and", + ], + 8: [ + "Hello I am doing a project for my school and I am trying to make a program that will read a .txt file", + "Hi today I am going to show you how to make a very simple and easy to make a very simple and", + ], + } model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, torch_dtype=torch.bfloat16).to( torch_device @@ -748,8 +759,9 @@ class GemmaIntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS) + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) + @require_read_token def test_model_7b_fp16_static_cache(self): model_id = "google/gemma-7b" EXPECTED_TEXTS = [ @@ -772,12 +784,19 @@ class GemmaIntegrationTest(unittest.TestCase): self.assertEqual(output_text, EXPECTED_TEXTS) @require_bitsandbytes + @require_read_token def test_model_7b_4bit(self): model_id = "google/gemma-7b" - EXPECTED_TEXTS = [ - "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", - """Hi today I am going to talk about the new update for the game called "The new update" and I""", - ] + EXPECTED_TEXTS = { + 7: [ + "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", + """Hi today I am going to talk about the new update for the game called "The new update" and I""", + ], + 8: [ + "Hello I am doing a project for my school and I am trying to make a program that will take a number and then", + "Hi today I am going to talk about the best way to get rid of acne. 
miniaturing is a very", + ], + } model = AutoModelForCausalLM.from_pretrained(model_id, low_cpu_mem_usage=True, load_in_4bit=True) @@ -787,4 +806,4 @@ class GemmaIntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20, do_sample=False) output_text = tokenizer.batch_decode(output, skip_special_tokens=True) - self.assertEqual(output_text, EXPECTED_TEXTS) + self.assertEqual(output_text, EXPECTED_TEXTS[self.cuda_compute_capability_major_version]) diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 0fb4087dba..dc24fd848c 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -597,8 +597,18 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi pass -@require_torch +@require_torch_gpu class LlamaIntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + @unittest.skip("Logits are not exactly the same, once we fix the instabalities somehow, will update!") @slow def test_model_7b_logits(self): @@ -675,16 +685,25 @@ class LlamaIntegrationTest(unittest.TestCase): @require_read_token def test_compile_static_cache(self): NUM_TOKENS_TO_GENERATE = 40 - EXPECTED_TEXT_COMPLETION = [ - "Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the same for all observers, and 3) the laws of physics are the same for all observers.", - "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", - ] + EXPECTED_TEXT_COMPLETION = { + 7: [ + "Simply put, the theory of relativity states that 1) the speed of light is constant, 2) the speed of light is the same for all observers, and 3) the laws of physics are the same for all observers.", + "My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", + ], + 8: [ + "Simply put, the theory of relativity states that 1) the speed of light is the same for all observers, and 2) the laws of physics are the same for all observers.\nThe first part of the theory of relativity", + "My favorite all time favorite condiment is ketchup. I love it on everything. 
I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p", + ], + } + prompts = [ "Simply put, the theory of relativity states that ", "My favorite all time favorite condiment is ketchup.", ] tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf", pad_token="", padding_side="right") - model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="sequential") + model = LlamaForCausalLM.from_pretrained( + "meta-llama/Llama-2-7b-hf", device_map="sequential", torch_dtype=torch.float16 + ) inputs = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device) def decode_one_tokens(model, cur_token, input_pos, cache_position): @@ -718,7 +737,7 @@ class LlamaIntegrationTest(unittest.TestCase): cache_position += 1 text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text) @require_torch @@ -763,6 +782,7 @@ end @require_torch_accelerator @slow + @unittest.skip("Model is too large") def test_model_7b_logits(self): model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf").to(torch_device) tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf") diff --git a/tests/models/mistral/test_modeling_mistral.py b/tests/models/mistral/test_modeling_mistral.py index 432097e9d1..59f3cdea69 100644 --- a/tests/models/mistral/test_modeling_mistral.py +++ b/tests/models/mistral/test_modeling_mistral.py @@ -470,39 +470,68 @@ class MistralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi self.skipTest("Mistral flash attention does not support right padding") -@require_torch +@require_torch_gpu class MistralIntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + + def tearDown(self): + torch.cuda.empty_cache() + gc.collect() + @slow def test_model_7b_logits(self): input_ids = [1, 306, 4658, 278, 6593, 310, 2834, 338] - model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto") + model = MistralForCausalLM.from_pretrained( + "mistralai/Mistral-7B-v0.1", device_map="auto", torch_dtype=torch.float16 + ) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) with torch.no_grad(): out = model(input_ids).logits.cpu() # Expected mean on dim = -1 EXPECTED_MEAN = torch.tensor([[-2.5548, -2.5737, -3.0600, -2.5906, -2.8478, -2.8118, -2.9325, -2.7694]]) torch.testing.assert_close(out.mean(-1), EXPECTED_MEAN, atol=1e-2, rtol=1e-2) - # slicing logits[0, 0, 0:30] - EXPECTED_SLICE = torch.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, -5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]) # fmt: skip + + EXPECTED_SLICE = { + 7: torch.tensor([-5.8781, -5.8616, -0.1052, -4.7200, -5.8781, -5.8774, -5.8773, -5.8777, -5.8781, -5.8780, -5.8781, -5.8779, -1.0787, 1.7583, -5.8779, -5.8780, -5.8783, -5.8778, -5.8776, 
-5.8781, -5.8784, -5.8778, -5.8778, -5.8777, -5.8779, -5.8778, -5.8776, -5.8780, -5.8779, -5.8781]), + 8: torch.tensor([-5.8711, -5.8555, -0.1050, -4.7148, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -1.0781, 1.7568, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711, -5.8711]), + } # fmt: skip + print(out[0, 0, :30]) - torch.testing.assert_close(out[0, 0, :30], EXPECTED_SLICE, atol=1e-4, rtol=1e-4) + torch.testing.assert_close( + out[0, 0, :30], EXPECTED_SLICE[self.cuda_compute_capability_major_version], atol=1e-4, rtol=1e-4 + ) del model backend_empty_cache(torch_device) gc.collect() @slow + @require_bitsandbytes def test_model_7b_generation(self): - EXPECTED_TEXT_COMPLETION = """My favourite condiment is 100% ketchup. I love it on everything. I’m not a big""" + EXPECTED_TEXT_COMPLETION = { + 7: "My favourite condiment is 100% ketchup. I love it on everything. I'm not a big", + 8: "My favourite condiment is 100% ketchup. I’m not a fan of mustard, mayo,", + } + prompt = "My favourite condiment is " tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) - model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", device_map="auto") + model = MistralForCausalLM.from_pretrained( + "mistralai/Mistral-7B-v0.1", device_map={"": torch_device}, load_in_4bit=True + ) input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.model.embed_tokens.weight.device) # greedy generation outputs generated_ids = model.generate(input_ids, max_new_tokens=20, temperature=0) text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text) del model backend_empty_cache(torch_device) @@ -517,7 +546,7 @@ class MistralIntegrationTest(unittest.TestCase): input_ids = [1] + [306, 338] * 2048 model = MistralForCausalLM.from_pretrained( "mistralai/Mistral-7B-v0.1", - device_map="auto", + device_map={"": torch_device}, load_in_4bit=True, attn_implementation="flash_attention_2", ) @@ -544,9 +573,7 @@ class MistralIntegrationTest(unittest.TestCase): # An input with 4097 tokens that is above the size of the sliding window input_ids = [1] + [306, 338] * 2048 model = MistralForCausalLM.from_pretrained( - "mistralai/Mistral-7B-v0.1", - device_map="auto", - attn_implementation="sdpa", + "mistralai/Mistral-7B-v0.1", device_map="auto", attn_implementation="sdpa", torch_dtype=torch.float16 ) input_ids = torch.tensor([input_ids]).to(model.model.embed_tokens.weight.device) generated_ids = model.generate(input_ids, max_new_tokens=4, temperature=0) @@ -577,9 +604,10 @@ class MistralIntegrationTest(unittest.TestCase): @slow def test_speculative_generation(self): - EXPECTED_TEXT_COMPLETION = ( - "My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs" - ) + EXPECTED_TEXT_COMPLETION = { + 7: "My favourite condiment is 100% Sriracha. I love the heat, the tang and the fact costs", + 8: "My favourite condiment is 100% Sriracha. 
I love the heat, the sweetness, the tang", + } prompt = "My favourite condiment is " tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", use_fast=False) model = MistralForCausalLM.from_pretrained( @@ -593,7 +621,7 @@ class MistralIntegrationTest(unittest.TestCase): input_ids, max_new_tokens=20, do_sample=True, temperature=0.3, assistant_model=model ) text = tokenizer.decode(generated_ids[0], skip_special_tokens=True) - self.assertEqual(EXPECTED_TEXT_COMPLETION, text) + self.assertEqual(EXPECTED_TEXT_COMPLETION[self.cuda_compute_capability_major_version], text) del model backend_empty_cache(torch_device) diff --git a/tests/models/mixtral/test_modeling_mixtral.py b/tests/models/mixtral/test_modeling_mixtral.py index 98654c5133..0cc8c9fc44 100644 --- a/tests/models/mixtral/test_modeling_mixtral.py +++ b/tests/models/mixtral/test_modeling_mixtral.py @@ -507,6 +507,16 @@ class MixtralModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi @require_torch class MixtralIntegrationTest(unittest.TestCase): + # This variable is used to determine which CUDA device are we using for our runners (A10 or T4) + # Depending on the hardware we get different logits / generations + cuda_compute_capability_major_version = None + + @classmethod + def setUpClass(cls): + if is_torch_available() and torch.cuda.is_available(): + # 8 is for A100 / A10 and 7 for T4 + cls.cuda_compute_capability_major_version = torch.cuda.get_device_capability()[0] + @slow @require_torch_gpu def test_small_model_logits(self): @@ -518,18 +528,26 @@ class MixtralIntegrationTest(unittest.TestCase): ) # TODO: might need to tweak it in case the logits do not match on our daily runners # these logits have been obtained with the original megablocks impelmentation. - EXPECTED_LOGITS = torch.Tensor( - [[0.1670, 0.1620, 0.6094], [-0.8906, -0.1588, -0.6060], [0.1572, 0.1290, 0.7246]] - ).to(torch_device) - + EXPECTED_LOGITS = { + 7: torch.Tensor([[0.1670, 0.1620, 0.6094], [-0.8906, -0.1588, -0.6060], [0.1572, 0.1290, 0.7246]]).to( + torch_device + ), + 8: torch.Tensor([[0.1631, 0.1621, 0.6094], [-0.8906, -0.1621, -0.6094], [0.1572, 0.1270, 0.7227]]).to( + torch_device + ), + } with torch.no_grad(): logits = model(dummy_input).logits - torch.testing.assert_close(logits[0, :3, :3].half(), EXPECTED_LOGITS, atol=1e-3, rtol=1e-3) - torch.testing.assert_close(logits[1, :3, :3].half(), EXPECTED_LOGITS, atol=1e-3, rtol=1e-3) + torch.testing.assert_close( + logits[0, :3, :3], EXPECTED_LOGITS[self.cuda_compute_capability_major_version], atol=1e-3, rtol=1e-3 + ) + torch.testing.assert_close( + logits[1, :3, :3], EXPECTED_LOGITS[self.cuda_compute_capability_major_version], atol=1e-3, rtol=1e-3 + ) @slow - # @require_torch_gpu + @require_torch_gpu def test_small_model_logits_batched(self): model_id = "hf-internal-testing/Mixtral-tiny" dummy_input = torch.LongTensor([[0, 0, 0, 0, 0, 0, 1, 2, 3], [1, 1, 2, 3, 4, 5, 6, 7, 8]]).to(torch_device) @@ -540,23 +558,48 @@ class MixtralIntegrationTest(unittest.TestCase): ) # TODO: might need to tweak it in case the logits do not match on our daily runners - EXPECTED_LOGITS_LEFT = torch.Tensor( - [[0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007]], - ) + EXPECTED_LOGITS_LEFT = { + 7: torch.Tensor( + [[0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007], [0.1750, 0.0537, 0.7007]], + ).to(torch_device), + 8: torch.Tensor([[0.1914, 0.0508, 0.7188], [0.1953, 0.0510, 0.7227], [0.1973, 0.0562, 0.7148]]).to( + torch_device + ), + } - # logits[0, -3:, -3:].half() - 
EXPECTED_LOGITS_LEFT_UNPADDED = torch.Tensor( - [[0.2212, 0.5200, -0.3816], [0.8213, -0.2313, 0.6069], [0.2664, -0.7090, 0.2468]], - ) + EXPECTED_LOGITS_LEFT_UNPADDED = { + 7: torch.Tensor( + [[0.2212, 0.5200, -0.3816], [0.8213, -0.2313, 0.6069], [0.2664, -0.7090, 0.2468]], + ).to(torch_device), + 8: torch.Tensor([[0.2217, 0.5195, -0.3828], [0.8203, -0.2295, 0.6055], [0.2676, -0.7109, 0.2461]]).to( + torch_device + ), + } - # logits[1, -3:, -3:].half() - EXPECTED_LOGITS_RIGHT_UNPADDED = torch.Tensor( - [[0.2205, 0.1232, -0.1611], [-0.3484, 0.3030, -1.0312], [0.0742, 0.7930, 0.7969]] - ) + EXPECTED_LOGITS_RIGHT_UNPADDED = { + 7: torch.Tensor([[0.2205, 0.1232, -0.1611], [-0.3484, 0.3030, -1.0312], [0.0742, 0.7930, 0.7969]]).to( + torch_device + ), + 8: torch.Tensor([[0.2178, 0.1260, -0.1621], [-0.3496, 0.2988, -1.0312], [0.0693, 0.7930, 0.8008]]).to( + torch_device + ), + } with torch.no_grad(): logits = model(dummy_input, attention_mask=attention_mask).logits - torch.testing.assert_close(logits[0, :3, :3].half(), EXPECTED_LOGITS_LEFT, atol=1e-3, rtol=1e-3) - torch.testing.assert_close(logits[0, -3:, -3:].half(), EXPECTED_LOGITS_LEFT_UNPADDED, atol=1e-3, rtol=1e-3) - torch.testing.assert_close(logits[1, -3:, -3:].half(), EXPECTED_LOGITS_RIGHT_UNPADDED, atol=1e-3, rtol=1e-3) + torch.testing.assert_close( + logits[0, :3, :3], EXPECTED_LOGITS_LEFT[self.cuda_compute_capability_major_version], atol=1e-3, rtol=1e-3 + ) + torch.testing.assert_close( + logits[0, -3:, -3:], + EXPECTED_LOGITS_LEFT_UNPADDED[self.cuda_compute_capability_major_version], + atol=1e-3, + rtol=1e-3, + ) + torch.testing.assert_close( + logits[1, -3:, -3:], + EXPECTED_LOGITS_RIGHT_UNPADDED[self.cuda_compute_capability_major_version], + atol=1e-3, + rtol=1e-3, + ) diff --git a/tests/models/whisper/test_modeling_whisper.py b/tests/models/whisper/test_modeling_whisper.py index a36bd5f216..a078eb375c 100644 --- a/tests/models/whisper/test_modeling_whisper.py +++ b/tests/models/whisper/test_modeling_whisper.py @@ -3339,3 +3339,21 @@ class WhisperStandaloneDecoderModelTest(ModelTesterMixin, GenerationTesterMixin, @unittest.skip("The model doesn't support fast init from base") def test_save_load_fast_init_from_base(self): pass + + @unittest.skip( + "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" + ) + def test_flash_attn_2_generate_padding_right(self): + pass + + @unittest.skip( + "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" + ) + def test_flash_attn_2_inference(self): + pass + + @unittest.skip( + "Duplicated test with WhisperModelTest + the FA2 testing suite needs to be refactored to be compatible with WhisperDecoder for that test" + ) + def test_flash_attn_2_inference_padding_right(self): + pass diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 7241993b6d..e92aca1cd7 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -3245,6 +3245,7 @@ class ModelTesterMixin: @require_torch_gpu @mark.flash_attn_test @slow + @is_flaky def test_flash_attn_2_inference_equivalence(self): for model_class in self.all_model_classes: if not model_class._supports_flash_attn_2: @@ -3338,6 +3339,7 @@ class ModelTesterMixin: @require_torch_gpu @mark.flash_attn_test @slow + @is_flaky def test_flash_attn_2_inference_equivalence_right_padding(self): for model_class in self.all_model_classes: if not 
model_class._supports_flash_attn_2: @@ -3427,6 +3429,7 @@ class ModelTesterMixin: @require_torch_gpu @mark.flash_attn_test @slow + @is_flaky def test_flash_attn_2_generate_left_padding(self): for model_class in self.all_generative_model_classes: if not model_class._supports_flash_attn_2: @@ -3470,6 +3473,7 @@ class ModelTesterMixin: @require_flash_attn @require_torch_gpu @mark.flash_attn_test + @is_flaky @slow def test_flash_attn_2_generate_padding_right(self): for model_class in self.all_generative_model_classes: @@ -3888,19 +3892,20 @@ class ModelTesterMixin: for model_class in self.all_generative_model_classes: if not model_class._supports_flash_attn_2: self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() model = model_class(config) - with tempfile.TemporaryDirectory() as tmpdirname: model.save_pretrained(tmpdirname) dummy_input = inputs_dict[model.main_input_name] dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input)) + batch_size = dummy_attention_mask.shape[0] - if model.config.is_encoder_decoder: - dummy_decoder_input_ids = inputs_dict["decoder_input_ids"] - dummy_decoder_attention_mask = inputs_dict["decoder_attention_mask"] + is_padding_right = dummy_attention_mask[:, -1].sum().item() != batch_size + + # To avoid errors with padding_side=="right" + if is_padding_right: + dummy_attention_mask = torch.ones_like(dummy_input) model = model_class.from_pretrained( tmpdirname, @@ -3916,6 +3921,9 @@ class ModelTesterMixin: param.data = param.data.to(torch.float32) if model.config.is_encoder_decoder: + dummy_decoder_input_ids = inputs_dict["decoder_input_ids"] + dummy_decoder_attention_mask = inputs_dict["decoder_attention_mask"] + _ = model(dummy_input, decoder_input_ids=dummy_decoder_input_ids) # with attention mask _ = model( From 6487e9b370da32dde8819fbe4422878dabba5211 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:03:36 +0200 Subject: [PATCH 061/110] Send headers when converting safetensors (#30144) Co-authored-by: Wauplin --- src/transformers/safetensors_conversion.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/safetensors_conversion.py b/src/transformers/safetensors_conversion.py index 5d3af9e8aa..a80927d61e 100644 --- a/src/transformers/safetensors_conversion.py +++ b/src/transformers/safetensors_conversion.py @@ -5,7 +5,7 @@ from typing import Optional import requests from huggingface_hub import Discussion, HfApi, get_repo_discussions -from .utils import cached_file, logging +from .utils import cached_file, http_user_agent, logging logger = logging.get_logger(__name__) @@ -86,7 +86,7 @@ def get_conversion_pr_reference(api: HfApi, model_id: str, **kwargs): def auto_conversion(pretrained_model_name_or_path: str, ignore_errors_during_conversion=False, **cached_file_kwargs): try: - api = HfApi(token=cached_file_kwargs.get("token")) + api = HfApi(token=cached_file_kwargs.get("token"), headers=http_user_agent()) sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs) if sha is None: From 58a939c6b77ec36b890c441a6a07d3ef0b8dd874 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 9 Apr 2024 17:10:29 +0200 Subject: [PATCH 062/110] Fix quantization tests (#29914) * revert back to torch 2.1.1 * run test * switch to torch 2.2.1 * udapte dockerfile * fix awq tests * fix test * 
run quanto tests * update tests * split quantization tests * fix * fix again * final fix * fix report artifact * build docker again * Revert "build docker again" This reverts commit 399a5f9d9308da071d79034f238c719de0f3532e. * debug * revert * style * new notification system * testing notfication * rebuild docker * fix_prev_ci_results * typo * remove warning * fix typo * fix artifact name * debug * issue fixed * debug again * fix * fix time * test notif with faling test * typo * issues again * final fix ? * run all quantization tests again * remove name to clear space * revert modfiication done on workflow * fix * build docker * build only quant docker * fix quantization ci * fix * fix report * better quantization_matrix * add print * revert to the basic one --- .github/workflows/self-scheduled.yml | 36 ++- .github/workflows/slack-report.yml | 25 +- .../Dockerfile | 8 +- src/transformers/utils/quantization_config.py | 2 +- tests/quantization/autoawq/test_awq.py | 30 +-- utils/notification_service.py | 2 - utils/notification_service_quantization.py | 251 ++++++++++++++++++ 7 files changed, 324 insertions(+), 30 deletions(-) create mode 100644 utils/notification_service_quantization.py diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 3e563e94e1..81620b740b 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -33,7 +33,6 @@ env: jobs: setup: - if: ${{ inputs.job == 'run_tests_gpu' }} name: Setup strategy: matrix: @@ -45,6 +44,7 @@ jobs: outputs: folder_slices: ${{ steps.set-matrix.outputs.folder_slices }} slice_ids: ${{ steps.set-matrix.outputs.slice_ids }} + quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }} steps: - name: Update clone working-directory: /transformers @@ -63,11 +63,19 @@ jobs: run: pip freeze - id: set-matrix + if: ${{ inputs.job == 'run_tests_gpu' }} name: Identify models to test working-directory: /transformers/tests run: | echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT + + - id: set-matrix-quantization + if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} + name: Identify quantization method to test + working-directory: /transformers/tests + run: | + echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT - name: NVIDIA-SMI run: | @@ -303,16 +311,26 @@ jobs: run_tests_quantization_torch_gpu: if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} - name: Quantization tests + name: " " strategy: fail-fast: false matrix: + folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }} machine_type: [single-gpu, multi-gpu] runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci] container: image: huggingface/transformers-quantization-latest-gpu options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'quantization/'/'quantization_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + - name: Update clone 
working-directory: /transformers run: git fetch && git checkout ${{ github.sha }} @@ -337,19 +355,19 @@ jobs: - name: Run quantization tests on GPU working-directory: /transformers run: | - python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu tests/quantization + python3 -m pytest -v --make-reports=${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} tests/${{ matrix.folders }} - name: Failure short reports if: ${{ failure() }} continue-on-error: true - run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu/failures_short.txt + run: cat /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }}/failures_short.txt - - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu" + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}" if: ${{ always() }} uses: actions/upload-artifact@v3 with: - name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu - path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu + name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }} + path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} run_extract_warnings: # Let's only do this for the job `run_tests_gpu` to simplify the (already complex) logic. @@ -413,4 +431,6 @@ jobs: slack_report_channel: ${{ inputs.slack_report_channel }} # This would be an empty string if `setup` is skipped. folder_slices: ${{ needs.setup.outputs.folder_slices }} - secrets: inherit \ No newline at end of file + quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} + + secrets: inherit diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index 0e964e8596..9e62417c76 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -15,6 +15,9 @@ on: folder_slices: required: true type: string + quantization_matrix: + required: true + type: string jobs: @@ -32,6 +35,7 @@ jobs: - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 - name: Send message to Slack + if: ${{ inputs.job != 'run_tests_quantization_torch_gpu' }} env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} @@ -53,7 +57,26 @@ jobs: pip install slack_sdk pip show slack_sdk python utils/notification_service.py "${{ inputs.folder_slices }}" - + + - uses: actions/checkout@v3 + - uses: actions/download-artifact@v3 + - name: Send message to Slack for quantization workflow + if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + SLACK_REPORT_CHANNEL: ${{ inputs.slack_report_channel }} + CI_EVENT: scheduled + CI_SHA: ${{ github.sha }} + SETUP_STATUS: ${{ inputs.setup_status }} + # We pass `needs.setup.outputs.quantization_matrix` as the argument. A processing in `notification_service_quantization.py` to change + # `quantization/bnb` to `quantization_bnb` is required, as the artifact names use `_` instead of `/`. 
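The two inline Python one-liners this workflow depends on — discovering the quantization test folders in the `setup` job and mapping them to artifact names for the report — are easier to follow spelled out. A standalone sketch, assuming it is run from the repository's `tests/` directory as the workflow does:

```python
import os

# `setup` job: one matrix entry per directory under tests/quantization.
tests = os.getcwd()  # the workflow runs this from /transformers/tests
entries = os.listdir(os.path.join(tests, "quantization"))
quantization_matrix = sorted(filter(os.path.isdir, [f"quantization/{x}" for x in entries]))

# Reporting side: artifact names use `_` instead of `/`.
artifact_names = [x.replace("quantization/", "quantization_") for x in quantization_matrix]
print(quantization_matrix)  # e.g. ['quantization/aqlm', 'quantization/autoawq', ...]
print(artifact_names)       # e.g. ['quantization_aqlm', 'quantization_autoawq', ...]
```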
+ run: | + sudo apt-get install -y curl + pip install slack_sdk + pip show slack_sdk + python utils/notification_service_quantization.py "${{ inputs.quantization_matrix }}" + # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - name: Failure table artifacts # Only the model testing job is concerned for this step diff --git a/docker/transformers-quantization-latest-gpu/Dockerfile b/docker/transformers-quantization-latest-gpu/Dockerfile index 8a526c7298..e1d084c403 100644 --- a/docker/transformers-quantization-latest-gpu/Dockerfile +++ b/docker/transformers-quantization-latest-gpu/Dockerfile @@ -9,7 +9,7 @@ SHELL ["sh", "-lc"] # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant # to be used as arguments for docker build (so far). -ARG PYTORCH='2.2.0' +ARG PYTORCH='2.2.1' # Example: `cu102`, `cu113`, etc. ARG CUDA='cu118' @@ -30,6 +30,9 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch] RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate +# needed in bnb and awq +RUN python3 -m pip install --no-cache-dir einops + # Add bitsandbytes for mixed int8 testing RUN python3 -m pip install --no-cache-dir bitsandbytes @@ -43,7 +46,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2 # Add autoawq for quantization testing -RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.0/autoawq-0.2.0+cu118-cp38-cp38-linux_x86_64.whl +# >=v0.2.3 needed for compatibility with torch 2.2.1 +RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl # Add quanto for quantization testing RUN python3 -m pip install --no-cache-dir quanto diff --git a/src/transformers/utils/quantization_config.py b/src/transformers/utils/quantization_config.py index 69bb0d5272..d91ecef16e 100644 --- a/src/transformers/utils/quantization_config.py +++ b/src/transformers/utils/quantization_config.py @@ -789,7 +789,7 @@ class AwqConfig(QuantizationConfigMixin): def get_loading_attributes(self): attibutes_dict = copy.deepcopy(self.__dict__) - loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len"] + loading_attibutes = ["version", "do_fuse", "modules_to_fuse", "fuse_max_seq_len", "exllama_config"] loading_attibutes_dict = {i: j for i, j in attibutes_dict.items() if i in loading_attibutes} return loading_attibutes_dict diff --git a/tests/quantization/autoawq/test_awq.py b/tests/quantization/autoawq/test_awq.py index 8ed8c394f4..8215f3f145 100644 --- a/tests/quantization/autoawq/test_awq.py +++ b/tests/quantization/autoawq/test_awq.py @@ -101,7 +101,7 @@ class AwqTest(unittest.TestCase): EXPECTED_OUTPUT = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Journalism and minoring in Spanish" EXPECTED_OUTPUT_BF16 = "Hello my name is Katie and I am a 20 year old student at the University of North Carolina at Chapel Hill. I am a junior and I am majoring in Exercise and Sport Science with a" - + EXPECTED_OUTPUT_EXLLAMA = "Hello my name is Katie and I am a 20 year old student from the UK. I am currently studying for a degree in English Literature and History at the University of York. 
I am a very out" device_map = "cuda" # called only once for all test in this class @@ -200,11 +200,11 @@ class AwqTest(unittest.TestCase): quantization_config = AwqConfig(version="exllama") quantized_model = AutoModelForCausalLM.from_pretrained( - self.model_name, quantization_config=quantization_config - ).to(torch_device) + self.model_name, quantization_config=quantization_config, device_map=torch_device + ) output = quantized_model.generate(**input_ids, max_new_tokens=40) - self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT) + self.assertEqual(self.tokenizer.decode(output[0], skip_special_tokens=True), self.EXPECTED_OUTPUT_EXLLAMA) def test_quantized_model_no_device_map(self): """ @@ -239,7 +239,7 @@ class AwqTest(unittest.TestCase): quantized_model = AutoModelForCausalLM.from_pretrained(self.model_name, device_map="auto") - self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1, 2, 3}) + self.assertTrue(set(quantized_model.hf_device_map.values()) == {0, 1}) output = quantized_model.generate(**input_ids, max_new_tokens=40) @@ -272,8 +272,8 @@ class AwqFusedTest(unittest.TestCase): model_name = "TheBloke/Mistral-7B-OpenOrca-AWQ" model_revision = "7048b2af77d0dd1c81b000b19d73f9cc8950b510" - custom_mapping_model_id = "TheBloke/Yi-34B-AWQ" - custom_model_revision = "f1b2cd1b7459ceecfdc1fac5bb8725f13707c589" + custom_mapping_model_id = "TheBloke/Mistral-7B-v0.1-AWQ" + custom_model_revision = "f186bcfa9edbe2a4334262ec1e67f23e53ed1ae7" mixtral_model_name = "casperhansen/mixtral-instruct-awq" mixtral_model_revision = "87dd4ec502dde74fb3a624835c776b000d190c3b" @@ -287,8 +287,8 @@ class AwqFusedTest(unittest.TestCase): "You end up exactly where you started. Where are you?" ) - EXPECTED_GENERATION = prompt + "\n\nThis is a classic puzzle that has been around for" - EXPECTED_GENERATION_CUSTOM_MODEL = "HelloWorld.java:11)\r\n\tat org" + EXPECTED_GENERATION = prompt + "\n\nYou are at the starting point.\n\nIf" + EXPECTED_GENERATION_CUSTOM_MODEL = "Hello,\n\nI have a problem with my 20" EXPECTED_GENERATION_MIXTRAL = prompt + " You're on the North Pole.\n\nThe" def tearDown(self): @@ -423,28 +423,25 @@ class AwqFusedTest(unittest.TestCase): fuse_max_seq_len=512, modules_to_fuse={ "attention": ["q_proj", "k_proj", "v_proj", "o_proj"], - "layernorm": ["ln1", "ln2", "norm"], "mlp": ["gate_proj", "up_proj", "down_proj"], + "layernorm": ["input_layernorm", "post_attention_layernorm", "norm"], "use_alibi": False, - "num_attention_heads": 56, + "hidden_size": 4096, + "num_attention_heads": 32, "num_key_value_heads": 8, - "hidden_size": 7168, }, ) model = AutoModelForCausalLM.from_pretrained( self.custom_mapping_model_id, quantization_config=quantization_config, - trust_remote_code=True, device_map="balanced", revision=self.custom_model_revision, ) self._check_fused_modules(model) - tokenizer = AutoTokenizer.from_pretrained( - self.custom_mapping_model_id, revision=self.custom_model_revision, trust_remote_code=True - ) + tokenizer = AutoTokenizer.from_pretrained(self.custom_mapping_model_id, revision=self.custom_model_revision) prompt = "Hello" inputs = tokenizer(prompt, return_tensors="pt").to(torch_device) @@ -452,6 +449,7 @@ class AwqFusedTest(unittest.TestCase): outputs = model.generate(**inputs, max_new_tokens=12) self.assertEqual(tokenizer.decode(outputs[0], skip_special_tokens=True), self.EXPECTED_GENERATION_CUSTOM_MODEL) + @unittest.skip("Not enough GPU memory on CI runners") @require_torch_multi_gpu def test_generation_mixtral_fused(self): 
""" diff --git a/utils/notification_service.py b/utils/notification_service.py index 5378348ee9..158e01942b 100644 --- a/utils/notification_service.py +++ b/utils/notification_service.py @@ -1056,7 +1056,6 @@ if __name__ == "__main__": "TensorFlow pipelines": "run_tests_tf_pipeline_gpu", "Examples directory": "run_examples_gpu", "Torch CUDA extension tests": "run_tests_torch_cuda_extensions_gpu_test_reports", - "Quantization tests": "run_tests_quantization_torch_gpu", } if ci_event in ["push", "Nightly CI"] or ci_event.startswith("Past CI"): @@ -1077,7 +1076,6 @@ if __name__ == "__main__": "run_pipelines_tf_gpu": "TensorFlow pipelines", "run_examples_gpu": "Examples directory", "run_all_tests_torch_cuda_extensions_gpu": "Torch CUDA extension tests", - "run_tests_quantization_torch_gpu": "Quantization tests", } # Remove some entries in `additional_files` if they are not concerned. diff --git a/utils/notification_service_quantization.py b/utils/notification_service_quantization.py new file mode 100644 index 0000000000..11bc57e618 --- /dev/null +++ b/utils/notification_service_quantization.py @@ -0,0 +1,251 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import ast +import json +import os +import sys +import time +from typing import Dict + +from get_ci_error_statistics import get_jobs +from notification_service import ( + Message, + handle_stacktraces, + handle_test_results, + prepare_reports, + retrieve_artifact, + retrieve_available_artifacts, +) +from slack_sdk import WebClient + + +client = WebClient(token=os.environ["CI_SLACK_BOT_TOKEN"]) + + +class QuantizationMessage(Message): + def __init__( + self, + title: str, + results: Dict, + ): + self.title = title + + # Failures and success of the modeling tests + self.n_success = sum(r["success"] for r in results.values()) + self.single_gpu_failures = sum(r["failed"]["single"] for r in results.values()) + self.multi_gpu_failures = sum(r["failed"]["multi"] for r in results.values()) + self.n_failures = self.single_gpu_failures + self.multi_gpu_failures + + self.n_tests = self.n_failures + self.n_success + self.results = results + self.thread_ts = None + + @property + def payload(self) -> str: + blocks = [self.header] + + if self.n_failures > 0: + blocks.append(self.failures_overwiew) + blocks.append(self.failures_detailed) + + if self.n_failures == 0: + blocks.append(self.no_failures) + + return json.dumps(blocks) + + @property + def time(self) -> str: + all_results = self.results.values() + time_spent = [] + for r in all_results: + if len(r["time_spent"]): + time_spent.extend([x for x in r["time_spent"].split(", ") if len(x.strip())]) + total_secs = 0 + + for time in time_spent: + time_parts = time.split(":") + + # Time can be formatted as xx:xx:xx, as .xx, or as x.xx if the time spent was less than a minute. 
+            if len(time_parts) == 1:
+                time_parts = [0, 0, time_parts[0]]
+
+            hours, minutes, seconds = int(time_parts[0]), int(time_parts[1]), float(time_parts[2])
+            total_secs += hours * 3600 + minutes * 60 + seconds
+
+        hours, minutes, seconds = total_secs // 3600, (total_secs % 3600) // 60, total_secs % 60
+        return f"{int(hours)}h{int(minutes)}m{int(seconds)}s"
+
+    @property
+    def failures_overview(self) -> Dict:
+        return {
+            "type": "section",
+            "text": {
+                "type": "plain_text",
+                "text": (
+                    f"There were {self.n_failures} failures, out of {self.n_tests} tests.\n"
+                    f"The suite ran in {self.time}."
+                ),
+                "emoji": True,
+            },
+            "accessory": {
+                "type": "button",
+                "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
+                "url": f"https://github.com/huggingface/transformers/actions/runs/{os.environ['GITHUB_RUN_ID']}",
+            },
+        }
+
+    @property
+    def failures_detailed(self) -> Dict:
+        failures = {k: v["failed"] for k, v in self.results.items()}
+
+        individual_reports = []
+        for key, value in failures.items():
+            device_report = self.get_device_report(value)
+            if sum(value.values()):
+                report = f"{device_report}{key}"
+                individual_reports.append(report)
+
+        header = "Single | Multi | Category\n"
+        failures_report = prepare_reports(
+            title="The following quantization tests had failures", header=header, reports=individual_reports
+        )
+
+        return {"type": "section", "text": {"type": "mrkdwn", "text": failures_report}}
+
+    def post(self):
+        payload = self.payload
+        print("Sending the following payload")
+        print(json.dumps({"blocks": json.loads(payload)}))
+
+        text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed."
+
+        self.thread_ts = client.chat_postMessage(
+            channel=SLACK_REPORT_CHANNEL_ID,
+            blocks=payload,
+            text=text,
+        )
+
+    def post_reply(self):
+        if self.thread_ts is None:
+            raise ValueError("Can only post reply if a post has been made.")
+
+        for job, job_result in self.results.items():
+            if len(job_result["failures"]):
+                for device, failures in job_result["failures"].items():
+                    blocks = self.get_reply_blocks(
+                        job,
+                        job_result,
+                        failures,
+                        device,
+                        text=f'Number of failures: {job_result["failed"][device]}',
+                    )
+
+                    print("Sending the following reply")
+                    print(json.dumps({"blocks": blocks}))
+
+                    client.chat_postMessage(
+                        channel="#transformers-ci-daily-quantization",
+                        text=f"Results for {job}",
+                        blocks=blocks,
+                        thread_ts=self.thread_ts["ts"],
+                    )
+                    time.sleep(1)
+
+
+if __name__ == "__main__":
+    setup_status = os.environ.get("SETUP_STATUS")
+    SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"]
+    setup_failed = True if setup_status is not None and setup_status != "success" else False
+
+    # This env. variable is set in workflow file (under the job `send_results`).
+    ci_event = os.environ["CI_EVENT"]
+
+    title = f"🤗 Results of the {ci_event} tests."
+
+    if setup_failed:
+        Message.error_out(
+            title, ci_title="", runner_not_available=False, runner_failed=False, setup_failed=setup_failed
+        )
+        exit(0)
+
+    arguments = sys.argv[1:][0]
+    try:
+        quantization_matrix = ast.literal_eval(arguments)
+        # Need to change from elements like `quantization/bnb` to `quantization_bnb` (the ones used as artifact names).
+ quantization_matrix = [x.replace("quantization/", "quantization_") for x in quantization_matrix] + except SyntaxError: + Message.error_out(title, ci_title="") + raise ValueError("Errored out.") + + available_artifacts = retrieve_available_artifacts() + + quantization_results = { + quant: { + "failed": {"single": 0, "multi": 0}, + "success": 0, + "time_spent": "", + "failures": {}, + "job_link": {}, + } + for quant in quantization_matrix + if f"run_tests_quantization_torch_gpu_{quant}" in available_artifacts + } + + github_actions_jobs = get_jobs( + workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"] + ) + github_actions_job_links = {job["name"]: job["html_url"] for job in github_actions_jobs} + + artifact_name_to_job_map = {} + for job in github_actions_jobs: + for step in job["steps"]: + if step["name"].startswith("Test suite reports artifacts: "): + artifact_name = step["name"][len("Test suite reports artifacts: ") :] + artifact_name_to_job_map[artifact_name] = job + break + + for quant in quantization_results.keys(): + for artifact_path in available_artifacts[f"run_tests_quantization_torch_gpu_{quant}"].paths: + artifact = retrieve_artifact(artifact_path["path"], artifact_path["gpu"]) + if "stats" in artifact: + # Link to the GitHub Action job + job = artifact_name_to_job_map[artifact_path["path"]] + quantization_results[quant]["job_link"][artifact_path["gpu"]] = job["html_url"] + failed, success, time_spent = handle_test_results(artifact["stats"]) + quantization_results[quant]["failed"][artifact_path["gpu"]] += failed + quantization_results[quant]["success"] += success + quantization_results[quant]["time_spent"] += time_spent[1:-1] + ", " + + stacktraces = handle_stacktraces(artifact["failures_line"]) + + for line in artifact["summary_short"].split("\n"): + if line.startswith("FAILED "): + line = line[len("FAILED ") :] + line = line.split()[0].replace("\n", "") + + if artifact_path["gpu"] not in quantization_results[quant]["failures"]: + quantization_results[quant]["failures"][artifact_path["gpu"]] = [] + + quantization_results[quant]["failures"][artifact_path["gpu"]].append( + {"line": line, "trace": stacktraces.pop(0)} + ) + + message = QuantizationMessage( + title, + results=quantization_results, + ) + + message.post() + message.post_reply() From 21e23ffca75575190a0a6088f2b51d38a098d4c4 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Tue, 9 Apr 2024 09:08:37 -0700 Subject: [PATCH 063/110] [docs] Fix image segmentation guide (#30132) fixes --- docs/source/en/tasks/semantic_segmentation.md | 179 +++++++++++++++++- 1 file changed, 173 insertions(+), 6 deletions(-) diff --git a/docs/source/en/tasks/semantic_segmentation.md b/docs/source/en/tasks/semantic_segmentation.md index e99499bbbb..675f9222ca 100644 --- a/docs/source/en/tasks/semantic_segmentation.md +++ b/docs/source/en/tasks/semantic_segmentation.md @@ -28,8 +28,9 @@ In this guide, we will: Before you begin, make sure you have all the necessary libraries installed: -```bash -pip install -q datasets transformers evaluate +```py +# uncomment to install the necessary libraries +!pip install -q datasets transformers evaluate accelerate ``` We encourage you to log in to your Hugging Face account so you can upload and share your model with the community. 
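A minimal sketch of that login step, assuming a notebook environment (`huggingface-cli login` is the terminal equivalent):

```py
# opens a prompt for your Hugging Face access token
from huggingface_hub import notebook_login

notebook_login()
```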
When prompted, enter your token to log in: @@ -236,6 +237,9 @@ Then take a look at an example: {'image': , 'annotation': , 'scene_category': 368} + +# view the image +>>> train_ds[0]["image"] ``` - `image`: a PIL image of the scene. @@ -663,15 +667,19 @@ Congratulations! You have fine-tuned your model and shared it on the 🤗 Hub. Y - ### Inference Great, now that you've finetuned a model, you can use it for inference! -Load an image for inference: +Reload the dataset and load an image for inference. ```py ->>> image = ds[0]["image"] +>>> from datasets import load_dataset + +>>> ds = load_dataset("scene_parse_150", split="train[:50]") +>>> ds = ds.train_test_split(test_size=0.2) +>>> test_ds = ds["test"] +>>> image = ds["test"][0]["image"] >>> image ``` @@ -749,7 +757,166 @@ Next, rescale the logits to the original image size and apply argmax on the clas -To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) as `ade_palette()` that maps each class to their RGB values. Then you can combine and plot your image and the predicted segmentation map: +To visualize the results, load the [dataset color palette](https://github.com/tensorflow/models/blob/3f1ca33afe3c1631b733ea7e40c294273b9e406d/research/deeplab/utils/get_dataset_colormap.py#L51) as `ade_palette()` that maps each class to their RGB values. + +```py +def ade_palette(): + return np.asarray([ + [0, 0, 0], + [120, 120, 120], + [180, 120, 120], + [6, 230, 230], + [80, 50, 50], + [4, 200, 3], + [120, 120, 80], + [140, 140, 140], + [204, 5, 255], + [230, 230, 230], + [4, 250, 7], + [224, 5, 255], + [235, 255, 7], + [150, 5, 61], + [120, 120, 70], + [8, 255, 51], + [255, 6, 82], + [143, 255, 140], + [204, 255, 4], + [255, 51, 7], + [204, 70, 3], + [0, 102, 200], + [61, 230, 250], + [255, 6, 51], + [11, 102, 255], + [255, 7, 71], + [255, 9, 224], + [9, 7, 230], + [220, 220, 220], + [255, 9, 92], + [112, 9, 255], + [8, 255, 214], + [7, 255, 224], + [255, 184, 6], + [10, 255, 71], + [255, 41, 10], + [7, 255, 255], + [224, 255, 8], + [102, 8, 255], + [255, 61, 6], + [255, 194, 7], + [255, 122, 8], + [0, 255, 20], + [255, 8, 41], + [255, 5, 153], + [6, 51, 255], + [235, 12, 255], + [160, 150, 20], + [0, 163, 255], + [140, 140, 140], + [250, 10, 15], + [20, 255, 0], + [31, 255, 0], + [255, 31, 0], + [255, 224, 0], + [153, 255, 0], + [0, 0, 255], + [255, 71, 0], + [0, 235, 255], + [0, 173, 255], + [31, 0, 255], + [11, 200, 200], + [255, 82, 0], + [0, 255, 245], + [0, 61, 255], + [0, 255, 112], + [0, 255, 133], + [255, 0, 0], + [255, 163, 0], + [255, 102, 0], + [194, 255, 0], + [0, 143, 255], + [51, 255, 0], + [0, 82, 255], + [0, 255, 41], + [0, 255, 173], + [10, 0, 255], + [173, 255, 0], + [0, 255, 153], + [255, 92, 0], + [255, 0, 255], + [255, 0, 245], + [255, 0, 102], + [255, 173, 0], + [255, 0, 20], + [255, 184, 184], + [0, 31, 255], + [0, 255, 61], + [0, 71, 255], + [255, 0, 204], + [0, 255, 194], + [0, 255, 82], + [0, 10, 255], + [0, 112, 255], + [51, 0, 255], + [0, 194, 255], + [0, 122, 255], + [0, 255, 163], + [255, 153, 0], + [0, 255, 10], + [255, 112, 0], + [143, 255, 0], + [82, 0, 255], + [163, 255, 0], + [255, 235, 0], + [8, 184, 170], + [133, 0, 255], + [0, 255, 92], + [184, 0, 255], + [255, 0, 31], + [0, 184, 255], + [0, 214, 255], + [255, 0, 112], + [92, 255, 0], + [0, 224, 255], + [112, 224, 255], + [70, 184, 160], + [163, 0, 255], + [153, 0, 255], + [71, 255, 0], + [255, 0, 163], + [255, 204, 0], + 
[255, 0, 143], + [0, 255, 235], + [133, 255, 0], + [255, 0, 235], + [245, 0, 255], + [255, 0, 122], + [255, 245, 0], + [10, 190, 212], + [214, 255, 0], + [0, 204, 255], + [20, 0, 255], + [255, 255, 0], + [0, 153, 255], + [0, 41, 255], + [0, 255, 204], + [41, 0, 255], + [41, 255, 0], + [173, 0, 255], + [0, 245, 255], + [71, 0, 255], + [122, 0, 255], + [0, 255, 184], + [0, 92, 255], + [184, 255, 0], + [0, 133, 255], + [255, 214, 0], + [25, 194, 194], + [102, 255, 0], + [92, 0, 255], + ]) +``` + +Then you can combine and plot your image and the predicted segmentation map: ```py >>> import matplotlib.pyplot as plt From 6cdbd73e01a9719bfaec07d91fd108e8d932bbbb Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Tue, 9 Apr 2024 18:10:00 +0200 Subject: [PATCH 064/110] [CI] Fix setup (#30147) * [CI] fix setup * fix * test * Revert "test" This reverts commit 7df416d45074439e2fa1b78afd24eacf37ce072f. --- .github/workflows/self-scheduled.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 81620b740b..62edf20b64 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -33,6 +33,7 @@ env: jobs: setup: + if: contains(fromJSON('["run_tests_gpu", "run_tests_quantization_torch_gpu"]'), inputs.job) name: Setup strategy: matrix: From 41579763ee47a68d0c15fd798db1a309723d94d2 Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 10 Apr 2024 12:45:07 +0500 Subject: [PATCH 065/110] Fix length related warnings in speculative decoding (#29585) * avoid generation length warning * add tests * Update src/transformers/generation/candidate_generator.py Co-authored-by: Joao Gante * add tests and minor fixes * refine `min_new_tokens` * Update src/transformers/generation/candidate_generator.py Co-authored-by: Joao Gante * add method to prepare length arguments * add test for min length * Update src/transformers/generation/candidate_generator.py Co-authored-by: Joao Gante * fix variable naming * empty commit for tests * trigger tests (empty) --------- Co-authored-by: Joao Gante --- .../generation/candidate_generator.py | 7 ++ src/transformers/generation/utils.py | 79 ++++++++++++++----- tests/generation/test_utils.py | 64 +++++++++++++++ 3 files changed, 131 insertions(+), 19 deletions(-) diff --git a/src/transformers/generation/candidate_generator.py b/src/transformers/generation/candidate_generator.py index 0859021956..735431fe6f 100644 --- a/src/transformers/generation/candidate_generator.py +++ b/src/transformers/generation/candidate_generator.py @@ -148,6 +148,11 @@ class AssistedCandidateGenerator(CandidateGenerator): self.generation_config.return_dict_in_generate = True self.generation_config.output_scores = True + # avoid unnecessary warnings that min_length is larger than max_new_tokens + self.main_model_min_length = self.generation_config.min_length + self.generation_config.min_length = 0 + self.generation_config.min_new_tokens = None + def get_candidates(self, input_ids: torch.LongTensor) -> Tuple[torch.LongTensor, Optional[torch.FloatTensor]]: """ Fetches the candidates to be tried for the current input. @@ -166,6 +171,7 @@ class AssistedCandidateGenerator(CandidateGenerator): # Don't generate more than `max_length - 1` candidates since the target model generates one extra token. 
        new_cur_len = input_ids.shape[-1]
         max_new_tokens = min(int(self.num_assistant_tokens), self.generation_config.max_length - new_cur_len - 1)
+        min_new_tokens = max(min(max_new_tokens, self.main_model_min_length - new_cur_len), 0)
         if max_new_tokens == 0:
             return input_ids, None
 
@@ -186,6 +192,7 @@ class AssistedCandidateGenerator(CandidateGenerator):
         # 2. Forecast next N tokens using the assistant model.
         assistant_generation_kwargs = {
             self.input_ids_key: input_ids,
+            "min_new_tokens": min_new_tokens,
             "max_new_tokens": max_new_tokens,
             "generation_config": self.generation_config,
             "logits_processor": self.logits_processor,
diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py
index cb3ac0ff1d..36e62794a4 100644
--- a/src/transformers/generation/utils.py
+++ b/src/transformers/generation/utils.py
@@ -1173,6 +1173,56 @@ class GenerationMixin:
                 UserWarning,
             )
 
+    def _prepare_generated_length(
+        self,
+        generation_config,
+        has_default_max_length,
+        has_default_min_length,
+        model_input_name,
+        input_ids_length,
+        inputs_tensor,
+    ):
+        """Prepare max and min length in generation configs to avoid clashes between similar attributes."""
+
+        if generation_config.max_new_tokens is not None:
+            if not has_default_max_length and generation_config.max_length is not None:
+                logger.warning(
+                    f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(="
+                    f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+                )
+            generation_config.max_length = generation_config.max_new_tokens + input_ids_length
+
+        # if both `inputs_embeds` and `input_ids` are passed, we do not correct the length
+        # otherwise we need total length [inputs-embeds-len + new-tokens-len] to not go beyond indicated `max_length`
+        elif (
+            model_input_name == "inputs_embeds"
+            and input_ids_length != inputs_tensor.shape[1]
+            and not self.config.is_encoder_decoder
+        ):
+            generation_config.max_length -= inputs_tensor.shape[1]
+
+        # same for min length
+        if generation_config.min_new_tokens is not None:
+            if not has_default_min_length:
+                logger.warning(
+                    f"Both `min_new_tokens` (={generation_config.min_new_tokens}) and `min_length`(="
+                    f"{generation_config.min_length}) seem to have been set. `min_new_tokens` will take precedence. "
+                    "Please refer to the documentation for more information. "
+                    "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)"
+                )
+            generation_config.min_length = generation_config.min_new_tokens + input_ids_length
+
+        elif (
+            model_input_name == "inputs_embeds"
+            and input_ids_length != inputs_tensor.shape[1]
+            and not self.config.is_encoder_decoder
+        ):
+            generation_config.min_length = max(generation_config.min_length - inputs_tensor.shape[1], 0)
+
+        return generation_config
+
     def _prepare_generation_config(
         self, generation_config: GenerationConfig, **kwargs: Dict
     ) -> Tuple[GenerationConfig, Dict]:
@@ -1418,24 +1468,15 @@ class GenerationMixin:
 
         # 6. Prepare `max_length` depending on other stopping criteria.
input_ids_length = input_ids.shape[-1] has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None - if generation_config.max_new_tokens is not None: - if not has_default_max_length and generation_config.max_length is not None: - logger.warning( - f"Both `max_new_tokens` (={generation_config.max_new_tokens}) and `max_length`(=" - f"{generation_config.max_length}) seem to have been set. `max_new_tokens` will take precedence. " - "Please refer to the documentation for more information. " - "(https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)" - ) - generation_config.max_length = generation_config.max_new_tokens + input_ids_length - - # otherwise the total length [inputs-embeds-len + new-tokens-len] will go beyond indicated `max_length`` - elif ( - model_input_name == "inputs_embeds" - and inputs_tensor.shape[:-1] != input_ids.shape - and not self.config.is_encoder_decoder - ): - generation_config.max_length -= inputs_tensor.shape[1] - generation_config.min_length = max(generation_config.min_length - inputs_tensor.shape[1], 0) + has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None + generation_config = self._prepare_generated_length( + generation_config=generation_config, + has_default_max_length=has_default_max_length, + has_default_min_length=has_default_min_length, + model_input_name=model_input_name, + inputs_tensor=inputs_tensor, + input_ids_length=input_ids_length, + ) if generation_config.cache_implementation in NEED_SETUP_CACHE_CLASSES_MAPPING: if generation_config.cache_implementation == "static": @@ -1511,7 +1552,7 @@ class GenerationMixin: ) # 12. run assisted generate - result = self.assisted_decoding( + result = self._assisted_decoding( input_ids, candidate_generator=candidate_generator, do_sample=generation_config.do_sample, diff --git a/tests/generation/test_utils.py b/tests/generation/test_utils.py index b346b745d8..d6b4840c49 100644 --- a/tests/generation/test_utils.py +++ b/tests/generation/test_utils.py @@ -1977,6 +1977,20 @@ class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMi out_gen_embeds = model.generate(inputs_embeds=inputs_embeds, max_length=max_length) self.assertEqual(out_gen.shape[-1], input_len + out_gen_embeds.shape[-1]) + def test_min_length_if_input_embeds(self): + # PT-only test: TF doesn't have StoppingCriteria + article = "Today a dragon flew over Paris." 
+ model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + input_ids = tokenizer(article, return_tensors="pt").input_ids.to(torch_device) + inputs_embeds = model.get_input_embeddings()(input_ids) + + min_length = 10 + input_len = input_ids.shape[-1] + out_gen = model.generate(input_ids=input_ids, min_length=min_length) + out_gen_embeds = model.generate(inputs_embeds=inputs_embeds, min_length=min_length) + self.assertEqual(out_gen.shape[-1], input_len + out_gen_embeds.shape[-1]) + def test_custom_stopping_criteria_overload_error(self): # PT-only test: TF doesn't have StoppingCriteria article = """Justin Timberlake and Jessica Biel, welcome to parenthood.""" @@ -2539,6 +2553,56 @@ class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMi model.generate(input_ids) self.assertEqual(len(warning_list), 0) + def test_length_warning_assisted_generation(self): + # PT-only test: TF doesn't support assisted decoding yet. + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model.config.pad_token_id = tokenizer.eos_token_id + assistant.config.pad_token_id = tokenizer.eos_token_id + + text = "Hello world" + tokenized_inputs = tokenizer([text], return_tensors="pt") + input_ids = tokenized_inputs.input_ids.to(torch_device) + + # This should not raise any warning that min length is not feasible in candidate generation + with warnings.catch_warnings(record=True) as warning_list: + model.generate( + input_ids, + assistant_model=assistant, + min_new_tokens=10, + max_length=20, + ) + self.assertEqual(len(warning_list), 0) + + def test_generated_length_assisted_generation(self): + # PT-only test: TF doesn't support assisted decoding yet. + model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + assistant = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-gpt2") + model.config.pad_token_id = tokenizer.eos_token_id + assistant.config.pad_token_id = tokenizer.eos_token_id + + text = "Hello world" + tokenized_inputs = tokenizer([text], return_tensors="pt") + input_ids = tokenized_inputs.input_ids.to(torch_device) + input_length = input_ids.shape[-1] + + out = model.generate( + input_ids, + assistant_model=assistant, + min_new_tokens=10, + max_new_tokens=20, + ) + self.assertTrue((10 + input_length) <= out.shape[-1] <= (20 + input_length)) + + out = model.generate( + input_ids, + assistant_model=assistant, + min_new_tokens=10, + ) + self.assertTrue((input_length + 10) <= out.shape[-1] <= 20) + def test_model_kwarg_assisted_decoding_decoder_only(self): # PT-only test: TF doesn't support assisted decoding yet. 
model = AutoModelForCausalLM.from_pretrained("hf-internal-testing/tiny-random-gpt2").to(torch_device) From 56d001b26f244018cbbb8aa573fc668b877223fa Mon Sep 17 00:00:00 2001 From: Pavel Iakubovskii Date: Wed, 10 Apr 2024 09:10:52 +0100 Subject: [PATCH 066/110] Fix and simplify semantic-segmentation example (#30145) * Remove unused augmentation * Fix pad_if_smaller() and remove unused augmentation * Add indentation * Fix requirements * Update dataset use instructions * Replace transforms with albumentations * Replace identity transform with None * Fixing formatting * Fixed comment place --- examples/pytorch/_tests_requirements.txt | 1 + .../pytorch/semantic-segmentation/README.md | 5 +- .../semantic-segmentation/requirements.txt | 6 +- .../run_semantic_segmentation.py | 210 +++++------------- .../run_semantic_segmentation_no_trainer.py | 206 ++++------------- 5 files changed, 110 insertions(+), 318 deletions(-) diff --git a/examples/pytorch/_tests_requirements.txt b/examples/pytorch/_tests_requirements.txt index d58e2def98..16b5eac32b 100644 --- a/examples/pytorch/_tests_requirements.txt +++ b/examples/pytorch/_tests_requirements.txt @@ -25,3 +25,4 @@ torchaudio jiwer librosa evaluate >= 0.2.0 +albumentations diff --git a/examples/pytorch/semantic-segmentation/README.md b/examples/pytorch/semantic-segmentation/README.md index 3b9d342d48..a0f830e16e 100644 --- a/examples/pytorch/semantic-segmentation/README.md +++ b/examples/pytorch/semantic-segmentation/README.md @@ -97,6 +97,10 @@ The script leverages the [🤗 Trainer API](https://huggingface.co/docs/transfor Here we show how to fine-tune a [SegFormer](https://huggingface.co/nvidia/mit-b0) model on the [segments/sidewalk-semantic](https://huggingface.co/datasets/segments/sidewalk-semantic) dataset: +In order to use `segments/sidewalk-semantic`: + - Log in to Hugging Face with `huggingface-cli login` (token can be accessed [here](https://huggingface.co/settings/tokens)). + - Accept terms of use for `sidewalk-semantic` on [dataset page](https://huggingface.co/datasets/segments/sidewalk-semantic). 
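Following the two access steps above, a quick hedged sanity check that the gated dataset is reachable (the `train` split name follows the dataset card; adjust if it differs):

```py
# verify gated-dataset access before launching the fine-tuning command below
from datasets import load_dataset

ds = load_dataset("segments/sidewalk-semantic", split="train")
print(ds)  # prints the features and number of rows if access was granted
```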
+
 ```bash
 python run_semantic_segmentation.py \
     --model_name_or_path nvidia/mit-b0 \
@@ -105,7 +109,6 @@ python run_semantic_segmentation.py \
     --remove_unused_columns False \
     --do_train \
     --do_eval \
-    --evaluation_strategy steps \
     --push_to_hub \
     --push_to_hub_model_id segformer-finetuned-sidewalk-10k-steps \
     --max_steps 10000 \
diff --git a/examples/pytorch/semantic-segmentation/requirements.txt b/examples/pytorch/semantic-segmentation/requirements.txt
index b839361cf2..7b130d79a6 100644
--- a/examples/pytorch/semantic-segmentation/requirements.txt
+++ b/examples/pytorch/semantic-segmentation/requirements.txt
@@ -1,4 +1,6 @@
-git://github.com/huggingface/accelerate.git
 datasets >= 2.0.0
 torch >= 1.3
-evaluate
\ No newline at end of file
+accelerate
+evaluate
+Pillow
+albumentations
\ No newline at end of file
diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
index 957b78b9b5..8377f808b5 100644
--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@@ -16,21 +16,20 @@
 import json
 import logging
 import os
-import random
 import sys
 import warnings
 from dataclasses import dataclass, field
+from functools import partial
 from typing import Optional
 
+import albumentations as A
 import evaluate
 import numpy as np
 import torch
+from albumentations.pytorch import ToTensorV2
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download
-from PIL import Image
 from torch import nn
-from torchvision import transforms
-from torchvision.transforms import functional
 
 import transformers
 from transformers import (
@@ -57,118 +56,19 @@ check_min_version("4.40.0.dev0")
 
 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")
 
 
-def pad_if_smaller(img, size, fill=0):
-    size = (size, size) if isinstance(size, int) else size
-    original_width, original_height = img.size
-    pad_height = size[1] - original_height if original_height < size[1] else 0
-    pad_width = size[0] - original_width if original_width < size[0] else 0
-    img = functional.pad(img, (0, 0, pad_width, pad_height), fill=fill)
-    return img
+def reduce_labels_transform(labels: np.ndarray, **kwargs) -> np.ndarray:
+    """Set the `0` label to 255 and then reduce all other labels by 1.
+ Example: + Initial class labels: 0 - background; 1 - road; 2 - car; + Transformed class labels: 255 - background; 0 - road; 1 - car; -class Compose: - def __init__(self, transforms): - self.transforms = transforms - - def __call__(self, image, target): - for t in self.transforms: - image, target = t(image, target) - return image, target - - -class Identity: - def __init__(self): - pass - - def __call__(self, image, target): - return image, target - - -class Resize: - def __init__(self, size): - self.size = size - - def __call__(self, image, target): - image = functional.resize(image, self.size) - target = functional.resize(target, self.size, interpolation=transforms.InterpolationMode.NEAREST) - return image, target - - -class RandomResize: - def __init__(self, min_size, max_size=None): - self.min_size = min_size - if max_size is None: - max_size = min_size - self.max_size = max_size - - def __call__(self, image, target): - size = random.randint(self.min_size, self.max_size) - image = functional.resize(image, size) - target = functional.resize(target, size, interpolation=transforms.InterpolationMode.NEAREST) - return image, target - - -class RandomCrop: - def __init__(self, size): - self.size = size if isinstance(size, tuple) else (size, size) - - def __call__(self, image, target): - image = pad_if_smaller(image, self.size) - target = pad_if_smaller(target, self.size, fill=255) - crop_params = transforms.RandomCrop.get_params(image, self.size) - image = functional.crop(image, *crop_params) - target = functional.crop(target, *crop_params) - return image, target - - -class RandomHorizontalFlip: - def __init__(self, flip_prob): - self.flip_prob = flip_prob - - def __call__(self, image, target): - if random.random() < self.flip_prob: - image = functional.hflip(image) - target = functional.hflip(target) - return image, target - - -class PILToTensor: - def __call__(self, image, target): - image = functional.pil_to_tensor(image) - target = torch.as_tensor(np.array(target), dtype=torch.int64) - return image, target - - -class ConvertImageDtype: - def __init__(self, dtype): - self.dtype = dtype - - def __call__(self, image, target): - image = functional.convert_image_dtype(image, self.dtype) - return image, target - - -class Normalize: - def __init__(self, mean, std): - self.mean = mean - self.std = std - - def __call__(self, image, target): - image = functional.normalize(image, mean=self.mean, std=self.std) - return image, target - - -class ReduceLabels: - def __call__(self, image, target): - if not isinstance(target, np.ndarray): - target = np.array(target).astype(np.uint8) - # avoid using underflow conversion - target[target == 0] = 255 - target = target - 1 - target[target == 254] = 255 - - target = Image.fromarray(target) - return image, target + **kwargs are required to use this function with albumentations. + """ + labels[labels == 0] = 255 + labels = labels - 1 + labels[labels == 254] = 255 + return labels @dataclass @@ -365,7 +265,7 @@ def main(): id2label = {int(k): v for k, v in id2label.items()} label2id = {v: str(k) for k, v in id2label.items()} - # Load the mean IoU metric from the datasets package + # Load the mean IoU metric from the evaluate package metric = evaluate.load("mean_iou", cache_dir=model_args.cache_dir) # Define our compute_metrics function. 
It takes an `EvalPrediction` object (a namedtuple with a @@ -424,64 +324,62 @@ def main(): token=model_args.token, trust_remote_code=model_args.trust_remote_code, ) + # `reduce_labels` is a property of dataset labels, in case we use image_processor + # pretrained on another dataset we should override the default setting + image_processor.do_reduce_labels = data_args.reduce_labels - # Define torchvision transforms to be applied to each image + target. - # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9 - # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py + # Define transforms to be applied to each image and target. if "shortest_edge" in image_processor.size: # We instead set the target size as (shortest_edge, shortest_edge) to here to ensure all images are batchable. - size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"]) + height, width = image_processor.size["shortest_edge"], image_processor.size["shortest_edge"] else: - size = (image_processor.size["height"], image_processor.size["width"]) - train_transforms = Compose( + height, width = image_processor.size["height"], image_processor.size["width"] + train_transforms = A.Compose( [ - ReduceLabels() if data_args.reduce_labels else Identity(), - RandomCrop(size=size), - RandomHorizontalFlip(flip_prob=0.5), - PILToTensor(), - ConvertImageDtype(torch.float), - Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + A.Lambda( + name="reduce_labels", + mask=reduce_labels_transform if data_args.reduce_labels else None, + p=1.0, + ), + # pad image with 255, because it is ignored by loss + A.PadIfNeeded(min_height=height, min_width=width, border_mode=0, value=255, p=1.0), + A.RandomCrop(height=height, width=width, p=1.0), + A.HorizontalFlip(p=0.5), + A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std, max_pixel_value=255.0, p=1.0), + ToTensorV2(), ] ) - # Define torchvision transform to be applied to each image. 
- # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) - val_transforms = Compose( + val_transforms = A.Compose( [ - ReduceLabels() if data_args.reduce_labels else Identity(), - Resize(size=size), - PILToTensor(), - ConvertImageDtype(torch.float), - Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + A.Lambda( + name="reduce_labels", + mask=reduce_labels_transform if data_args.reduce_labels else None, + p=1.0, + ), + A.Resize(height=height, width=width, p=1.0), + A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std, max_pixel_value=255.0, p=1.0), + ToTensorV2(), ] ) - def preprocess_train(example_batch): + def preprocess_batch(example_batch, transforms: A.Compose): pixel_values = [] labels = [] for image, target in zip(example_batch["image"], example_batch["label"]): - image, target = train_transforms(image.convert("RGB"), target) - pixel_values.append(image) - labels.append(target) + transformed = transforms(image=np.array(image.convert("RGB")), mask=np.array(target)) + pixel_values.append(transformed["image"]) + labels.append(transformed["mask"]) encoding = {} - encoding["pixel_values"] = torch.stack(pixel_values) - encoding["labels"] = torch.stack(labels) + encoding["pixel_values"] = torch.stack(pixel_values).to(torch.float) + encoding["labels"] = torch.stack(labels).to(torch.long) return encoding - def preprocess_val(example_batch): - pixel_values = [] - labels = [] - for image, target in zip(example_batch["image"], example_batch["label"]): - image, target = val_transforms(image.convert("RGB"), target) - pixel_values.append(image) - labels.append(target) - - encoding = {} - encoding["pixel_values"] = torch.stack(pixel_values) - encoding["labels"] = torch.stack(labels) - - return encoding + # Preprocess function for dataset should have only one argument, + # so we use partial to pass the transforms + preprocess_train_batch_fn = partial(preprocess_batch, transforms=train_transforms) + preprocess_val_batch_fn = partial(preprocess_batch, transforms=val_transforms) if training_args.do_train: if "train" not in dataset: @@ -491,7 +389,7 @@ def main(): dataset["train"].shuffle(seed=training_args.seed).select(range(data_args.max_train_samples)) ) # Set the training transforms - dataset["train"].set_transform(preprocess_train) + dataset["train"].set_transform(preprocess_train_batch_fn) if training_args.do_eval: if "validation" not in dataset: @@ -501,7 +399,7 @@ def main(): dataset["validation"].shuffle(seed=training_args.seed).select(range(data_args.max_eval_samples)) ) # Set the validation transforms - dataset["validation"].set_transform(preprocess_val) + dataset["validation"].set_transform(preprocess_val_batch_fn) # Initialize our trainer trainer = Trainer( diff --git a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py index ba6b372f09..2b3f53a170 100644 --- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py +++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py @@ -18,9 +18,10 @@ import argparse import json import math import os -import random +from functools import partial from pathlib import Path +import albumentations as A import datasets import evaluate import numpy as np @@ -28,12 +29,10 @@ import torch from accelerate import Accelerator from accelerate.logging import get_logger from accelerate.utils import set_seed +from albumentations.pytorch 
import ToTensorV2
 from datasets import load_dataset
 from huggingface_hub import HfApi, hf_hub_download
-from PIL import Image
 from torch.utils.data import DataLoader
-from torchvision import transforms
-from torchvision.transforms import functional
 from tqdm.auto import tqdm
 
 import transformers
@@ -57,123 +56,23 @@ logger = get_logger(__name__)
 
 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")
 
 
-def pad_if_smaller(img, size, fill=0):
-    min_size = min(img.size)
-    if min_size < size:
-        original_width, original_height = img.size
-        pad_height = size - original_height if original_height < size else 0
-        pad_width = size - original_width if original_width < size else 0
-        img = functional.pad(img, (0, 0, pad_width, pad_height), fill=fill)
-    return img
+def reduce_labels_transform(labels: np.ndarray, **kwargs) -> np.ndarray:
+    """Set the `0` label to 255 and then reduce all other labels by 1.
+
+    Example:
+    Initial class labels: 0 - background; 1 - road; 2 - car;
+    Transformed class labels: 255 - background; 0 - road; 1 - car;
 
-class Compose:
-    def __init__(self, transforms):
-        self.transforms = transforms
-
-    def __call__(self, image, target):
-        for t in self.transforms:
-            image, target = t(image, target)
-        return image, target
-
-
-class Identity:
-    def __init__(self):
-        pass
-
-    def __call__(self, image, target):
-        return image, target
-
-
-class Resize:
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, image, target):
-        image = functional.resize(image, self.size)
-        target = functional.resize(target, self.size, interpolation=transforms.InterpolationMode.NEAREST)
-        return image, target
-
-
-class RandomResize:
-    def __init__(self, min_size, max_size=None):
-        self.min_size = min_size
-        if max_size is None:
-            max_size = min_size
-        self.max_size = max_size
-
-    def __call__(self, image, target):
-        size = random.randint(self.min_size, self.max_size)
-        image = functional.resize(image, size)
-        target = functional.resize(target, size, interpolation=transforms.InterpolationMode.NEAREST)
-        return image, target
-
-
-class RandomCrop:
-    def __init__(self, size):
-        self.size = size
-
-    def __call__(self, image, target):
-        image = pad_if_smaller(image, self.size)
-        target = pad_if_smaller(target, self.size, fill=255)
-        crop_params = transforms.RandomCrop.get_params(image, (self.size, self.size))
-        image = functional.crop(image, *crop_params)
-        target = functional.crop(target, *crop_params)
-        return image, target
-
-
-class RandomHorizontalFlip:
-    def __init__(self, flip_prob):
-        self.flip_prob = flip_prob
-
-    def __call__(self, image, target):
-        if random.random() < self.flip_prob:
-            image = functional.hflip(image)
-            target = functional.hflip(target)
-        return image, target
-
-
-class PILToTensor:
-    def __call__(self, image, target):
-        image = functional.pil_to_tensor(image)
-        target = torch.as_tensor(np.array(target), dtype=torch.int64)
-        return image, target
-
-
-class ConvertImageDtype:
-    def __init__(self, dtype):
-        self.dtype = dtype
-
-    def __call__(self, image, target):
-        image = functional.convert_image_dtype(image, self.dtype)
-        return image, target
-
-
-class Normalize:
-    def __init__(self, mean, std):
-        self.mean = mean
-        self.std = std
-
-    def __call__(self, image, target):
-        image = functional.normalize(image, mean=self.mean, std=self.std)
-        return image, target
-
-
-class ReduceLabels:
-    def __call__(self, image, target):
-        if not isinstance(target, np.ndarray):
-            target = np.array(target).astype(np.uint8)
-        # avoid using underflow conversion
-        target[target == 0] = 255
-        target = target - 1
-        target[target == 254] = 255
-
-        target = Image.fromarray(target)
-        return image, target
+    **kwargs are required to use this function with albumentations.
+    """
+    labels[labels == 0] = 255
+    labels = labels - 1
+    labels[labels == 254] = 255
+    return labels
 
 
 def parse_args():
-    parser = argparse.ArgumentParser(description="Finetune a transformers model on a text classification task")
+    parser = argparse.ArgumentParser(description="Finetune a transformers model on an image semantic segmentation task")
     parser.add_argument(
         "--model_name_or_path",
         type=str,
@@ -418,69 +317,58 @@ def main():
     model = AutoModelForSemanticSegmentation.from_pretrained(
         args.model_name_or_path, config=config, trust_remote_code=args.trust_remote_code
     )
+    # `reduce_labels` is a property of dataset labels, in case we use image_processor
+    # pretrained on another dataset we should override the default setting
+    image_processor.do_reduce_labels = args.reduce_labels
 
-    # Preprocessing the datasets
-    # Define torchvision transforms to be applied to each image + target.
-    # Not that straightforward in torchvision: https://github.com/pytorch/vision/issues/9
-    # Currently based on official torchvision references: https://github.com/pytorch/vision/blob/main/references/segmentation/transforms.py
+    # Define transforms to be applied to each image and target.
     if "shortest_edge" in image_processor.size:
         # We instead set the target size as (shortest_edge, shortest_edge) to here to ensure all images are batchable.
-        size = (image_processor.size["shortest_edge"], image_processor.size["shortest_edge"])
+        height, width = image_processor.size["shortest_edge"], image_processor.size["shortest_edge"]
     else:
-        size = (image_processor.size["height"], image_processor.size["width"])
-    train_transforms = Compose(
+        height, width = image_processor.size["height"], image_processor.size["width"]
+    train_transforms = A.Compose(
         [
-            ReduceLabels() if args.reduce_labels else Identity(),
-            RandomCrop(size=size),
-            RandomHorizontalFlip(flip_prob=0.5),
-            PILToTensor(),
-            ConvertImageDtype(torch.float),
-            Normalize(mean=image_processor.image_mean, std=image_processor.image_std),
+            A.Lambda(name="reduce_labels", mask=reduce_labels_transform if args.reduce_labels else None, p=1.0),
+            # pad image with 255, because it is ignored by loss
+            A.PadIfNeeded(min_height=height, min_width=width, border_mode=0, value=255, p=1.0),
+            A.RandomCrop(height=height, width=width, p=1.0),
+            A.HorizontalFlip(p=0.5),
+            A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std, max_pixel_value=255.0, p=1.0),
+            ToTensorV2(),
         ]
     )
-    # Define torchvision transform to be applied to each image.
- # jitter = ColorJitter(brightness=0.25, contrast=0.25, saturation=0.25, hue=0.1) - val_transforms = Compose( + val_transforms = A.Compose( [ - ReduceLabels() if args.reduce_labels else Identity(), - Resize(size=size), - PILToTensor(), - ConvertImageDtype(torch.float), - Normalize(mean=image_processor.image_mean, std=image_processor.image_std), + A.Lambda(name="reduce_labels", mask=reduce_labels_transform if args.reduce_labels else None, p=1.0), + A.Resize(height=height, width=width, p=1.0), + A.Normalize(mean=image_processor.image_mean, std=image_processor.image_std, max_pixel_value=255.0, p=1.0), + ToTensorV2(), ] ) - def preprocess_train(example_batch): + def preprocess_batch(example_batch, transforms: A.Compose): pixel_values = [] labels = [] for image, target in zip(example_batch["image"], example_batch["label"]): - image, target = train_transforms(image.convert("RGB"), target) - pixel_values.append(image) - labels.append(target) + transformed = transforms(image=np.array(image.convert("RGB")), mask=np.array(target)) + pixel_values.append(transformed["image"]) + labels.append(transformed["mask"]) encoding = {} - encoding["pixel_values"] = torch.stack(pixel_values) - encoding["labels"] = torch.stack(labels) + encoding["pixel_values"] = torch.stack(pixel_values).to(torch.float) + encoding["labels"] = torch.stack(labels).to(torch.long) return encoding - def preprocess_val(example_batch): - pixel_values = [] - labels = [] - for image, target in zip(example_batch["image"], example_batch["label"]): - image, target = val_transforms(image.convert("RGB"), target) - pixel_values.append(image) - labels.append(target) - - encoding = {} - encoding["pixel_values"] = torch.stack(pixel_values) - encoding["labels"] = torch.stack(labels) - - return encoding + # Preprocess function for dataset should have only one input argument, + # so we use partial to pass transforms + preprocess_train_batch_fn = partial(preprocess_batch, transforms=train_transforms) + preprocess_val_batch_fn = partial(preprocess_batch, transforms=val_transforms) with accelerator.main_process_first(): - train_dataset = dataset["train"].with_transform(preprocess_train) - eval_dataset = dataset["validation"].with_transform(preprocess_val) + train_dataset = dataset["train"].with_transform(preprocess_train_batch_fn) + eval_dataset = dataset["validation"].with_transform(preprocess_val_batch_fn) train_dataloader = DataLoader( train_dataset, shuffle=True, collate_fn=default_data_collator, batch_size=args.per_device_train_batch_size @@ -726,7 +614,7 @@ def main(): f"eval_{k}": v.tolist() if isinstance(v, np.ndarray) else v for k, v in eval_metrics.items() } with open(os.path.join(args.output_dir, "all_results.json"), "w") as f: - json.dump(all_results, f) + json.dump(all_results, f, indent=2) if __name__ == "__main__": From bb76f81e4036b49734d1ecbefb98945164ac9c07 Mon Sep 17 00:00:00 2001 From: Marc Sun <57196510+SunMarc@users.noreply.github.com> Date: Wed, 10 Apr 2024 11:51:06 +0200 Subject: [PATCH 067/110] [CI] Quantization workflow fix (#30158) * fix workflow * call ci * Update .github/workflows/self-scheduled-caller.yml Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --------- Co-authored-by: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> --- .github/workflows/self-scheduled.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 62edf20b64..3590bb9f84 100644 --- a/.github/workflows/self-scheduled.yml +++ 
b/.github/workflows/self-scheduled.yml @@ -313,6 +313,7 @@ jobs: run_tests_quantization_torch_gpu: if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} name: " " + needs: setup strategy: fail-fast: false matrix: From 185463784e0a0b4cd7974ce5bded7a52ae170f6d Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Wed, 10 Apr 2024 20:46:39 +0800 Subject: [PATCH 068/110] [tests] make 2 tests device-agnostic (#30008) add torch device --- tests/models/blip_2/test_modeling_blip_2.py | 4 ++-- tests/test_modeling_utils.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/models/blip_2/test_modeling_blip_2.py b/tests/models/blip_2/test_modeling_blip_2.py index ccf3051a17..984d432a36 100644 --- a/tests/models/blip_2/test_modeling_blip_2.py +++ b/tests/models/blip_2/test_modeling_blip_2.py @@ -992,7 +992,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase): # prepare image image = prepare_img() - inputs = processor(images=image, return_tensors="pt").to(0, dtype=torch.float16) + inputs = processor(images=image, return_tensors="pt").to(f"{torch_device}:0", dtype=torch.float16) predictions = model.generate(**inputs) generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() @@ -1003,7 +1003,7 @@ class Blip2ModelIntegrationTest(unittest.TestCase): # image and context prompt = "Question: which city is this? Answer:" - inputs = processor(images=image, text=prompt, return_tensors="pt").to(0, dtype=torch.float16) + inputs = processor(images=image, text=prompt, return_tensors="pt").to(f"{torch_device}:0", dtype=torch.float16) predictions = model.generate(**inputs) generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip() diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index e6f57d68cc..b6c1e99737 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -776,7 +776,7 @@ class ModelUtilsTest(TestCasePlus): tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") inputs = tokenizer("Hello, my name is", return_tensors="pt") - output = model.generate(inputs["input_ids"].to(0)) + output = model.generate(inputs["input_ids"].to(f"{torch_device}:0")) text_output = tokenizer.decode(output[0].tolist()) self.assertEqual(text_output, "Hello, my name is John. I'm a writer, and I'm a writer. 
I'm") From b7d002bdff3646cfd55f120b2b9e1b065d54fae5 Mon Sep 17 00:00:00 2001 From: Matthew Hoffman Date: Wed, 10 Apr 2024 08:42:00 -0500 Subject: [PATCH 069/110] Add str to TrainingArguments report_to type hint (#30078) * Add str to TrainingArguments report_to type hint * Swap order in Union * Merge Optional into Union https://github.com/huggingface/transformers/pull/30078#issuecomment-2042227546 --- src/transformers/training_args.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py index 694c142437..cdf6325c4b 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py @@ -1181,7 +1181,7 @@ class TrainingArguments: default="length", metadata={"help": "Column name with precomputed lengths to use when grouping by length."}, ) - report_to: Optional[List[str]] = field( + report_to: Union[None, str, List[str]] = field( default=None, metadata={"help": "The list of integrations to report the results and logs to."} ) ddp_find_unused_parameters: Optional[bool] = field( From 50c1c19fc7cdb5fe94697131d7ee04253596e994 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Wed, 10 Apr 2024 15:47:17 +0200 Subject: [PATCH 070/110] [UDOP] Fix tests (#29573) * Fix tests * Fix tests * Remove no_split_modules --- src/transformers/models/udop/modeling_udop.py | 1 - tests/models/udop/test_modeling_udop.py | 27 +++++++++++++------ tests/models/udop/test_processor_udop.py | 2 +- 3 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 7e242c8b52..0d817a324b 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -413,7 +413,6 @@ class UdopPreTrainedModel(PreTrainedModel): config_class = UdopConfig base_model_prefix = "transformer" supports_gradient_checkpointing = True - _no_split_modules = ["UdopBlock"] _keep_in_fp32_modules = ["wo"] def _init_weights(self, module): diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py index 7041f25f4e..257f6245ee 100644 --- a/tests/models/udop/test_modeling_udop.py +++ b/tests/models/udop/test_modeling_udop.py @@ -226,6 +226,20 @@ class UdopModelTester: ) self.parent.assertTrue(torch.all(output_with_past_cache == output_without_past_cache)) + def create_and_check_model_fp16_forward( + self, + config, + input_ids, + bbox, + decoder_input_ids, + attention_mask, + decoder_attention_mask, + lm_labels, + ): + model = UdopForConditionalGeneration(config=config).to(torch_device).half().eval() + output = model(input_ids, bbox=bbox, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids).logits + self.parent.assertFalse(torch.isnan(output).any().item()) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -268,6 +282,7 @@ class UdopModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): test_resize_embeddings = True test_model_parallel = False is_encoder_decoder = True + test_cpu_offload = False # The small UDOP model needs higher percentages for CPU/MP tests model_split_percents = [0.8, 0.9] @@ -491,10 +506,11 @@ class UdopEncoderOnlyModelTester: self, config, input_ids, + bbox, attention_mask, ): model = UdopEncoderModel(config=config).to(torch_device).half().eval() - output = model(input_ids, attention_mask=attention_mask)["last_hidden_state"] + output 
= model(input_ids, bbox=bbox, attention_mask=attention_mask)["last_hidden_state"] self.parent.assertFalse(torch.isnan(output).any().item()) @@ -504,7 +520,7 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): test_torchscript = False test_head_masking = False test_resize_embeddings = False - test_model_parallel = True + test_model_parallel = False all_parallelizable_model_classes = (UdopEncoderModel,) if is_torch_available() else () def setUp(self): @@ -518,11 +534,6 @@ class UdopEncoderOnlyModelTest(ModelTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) - @unittest.skipIf(torch_device == "cpu", "Cant do half precision") - def test_model_fp16_forward(self): - config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model_fp16_forward(*config_and_inputs) - @unittest.skip( "Not currently compatible. Fails with - NotImplementedError: Cannot copy out of meta tensor; no data!" ) @@ -558,7 +569,7 @@ class UdopModelIntegrationTests(unittest.TestCase): model = self.model prompt = "Question answering. In which year is the report made?" - encoding = processor(images=self.image, text=prompt, return_tensors="pt") + encoding = processor(images=self.image, text=prompt, return_tensors="pt").to(torch_device) predicted_ids = model.generate(**encoding) diff --git a/tests/models/udop/test_processor_udop.py b/tests/models/udop/test_processor_udop.py index 53a50b9a11..ceb5f1e331 100644 --- a/tests/models/udop/test_processor_udop.py +++ b/tests/models/udop/test_processor_udop.py @@ -286,7 +286,7 @@ class UdopProcessorIntegrationTests(unittest.TestCase): # verify input_ids # this was obtained with Tesseract 4.1.1 # fmt: off - expected_decoding = "7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223" # noqa: E231 + expected_decoding = "7 ITC Limited REPORT AND ACCOUNTS 2013 ITC’s Brands: An Asset for the Nation The consumer needs and aspirations they fulfil, the benefit they generate for millions across ITC’s value chains, the future-ready capabilities that support them, and the value that they create for the country, have made ITC’s brands national assets, adding to India’s competitiveness. It is ITC’s aspiration to be the No 1 FMCG player in the country, driven by its new FMCG businesses. A recent Nielsen report has highlighted that ITC's new FMCG businesses are the fastest growing among the top consumer goods companies operating in India. 
ITC takes justifiable pride that, along with generating economic value, these celebrated Indian brands also drive the creation of larger societal capital through the virtuous cycle of sustainable and inclusive growth. DI WILLS * ; LOVE DELIGHTFULLY SOFT SKIN? aia Ans Source: https://www.industrydocuments.ucsf.edu/docs/snbx0223"  # noqa: E231
         # fmt: on
         decoding = processor.decode(input_processor.input_ids[1].tolist())
         self.assertSequenceEqual(decoding, expected_decoding)

From 505854f78f61c5254c75e3ffbcf3d7c7fadf65fb Mon Sep 17 00:00:00 2001
From: NielsRogge <48327001+NielsRogge@users.noreply.github.com>
Date: Wed, 10 Apr 2024 16:02:50 +0200
Subject: [PATCH 071/110] [UDOP] Improve docs, add resources (#29571)

* Improve docs

* Add more tips

---
 docs/source/en/model_doc/udop.md              | 17 ++++++++++++++---
 src/transformers/models/udop/modeling_udop.py | 17 +++++++++++++++++
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/model_doc/udop.md b/docs/source/en/model_doc/udop.md
index b84ec160f7..614bd2ff4f 100644
--- a/docs/source/en/model_doc/udop.md
+++ b/docs/source/en/model_doc/udop.md
@@ -56,14 +56,25 @@ image = Image.open(name_of_your_document).convert("RGB")
 width, height = image.size
 ```
 
+One can use [`UdopProcessor`] to prepare images and text for the model, which takes care of all of this. By default, this class uses the Tesseract engine to extract a list of words and boxes (coordinates) from a given document. Its functionality is equivalent to that of [`LayoutLMv3Processor`], hence it supports passing either `apply_ocr=False` in case you prefer to use your own OCR engine or `apply_ocr=True` in case you want the default OCR engine to be used. Refer to the [usage guide of LayoutLMv2](layoutlmv2#usage-layoutlmv2processor) regarding all possible use cases (the functionality of `UdopProcessor` is identical).
+
+- If you use your own OCR engine, one recommendation is Azure's [Read API](https://learn.microsoft.com/en-us/azure/ai-services/computer-vision/how-to/call-read-api), which supports so-called line segments. Use of segment position embeddings typically results in better performance.
 - At inference time, it's recommended to use the `generate` method to autoregressively generate text given a document image.
-- One can use [`UdopProcessor`] to prepare images and text for the model. By default, this class uses the Tesseract engine to extract a list of words
-and boxes (coordinates) from a given document. Its functionality is equivalent to that of [`LayoutLMv3Processor`], hence it supports passing either
-`apply_ocr=False` in case you prefer to use your own OCR engine or `apply_ocr=True` in case you want the default OCR engine to be used.
+- The model has been pre-trained on both self-supervised and supervised objectives. One can use the various task prefixes (prompts) used during pre-training to test out the out-of-the-box capabilities. For instance, the model can be prompted with "Question answering. What is the date?", as "Question answering." is the task prefix used during pre-training for DocVQA. Refer to the [paper](https://arxiv.org/abs/2212.02623) (table 1) for all task prefixes.
+- One can also fine-tune [`UdopEncoderModel`], the encoder-only part of UDOP, which can be seen as a LayoutLMv3-like Transformer encoder. For discriminative tasks, one can just add a linear classifier on top of it and fine-tune it on a labeled dataset.
 
 This model was contributed by [nielsr](https://huggingface.co/nielsr).
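Putting the tips above together, a hedged end-to-end sketch (assumes Tesseract is installed for the default OCR; `document.png` is a placeholder path):

```py
# default OCR (apply_ocr=True) plus the DocVQA task prefix from the tips above
from PIL import Image
from transformers import AutoProcessor, UdopForConditionalGeneration

processor = AutoProcessor.from_pretrained("microsoft/udop-large")
model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large")

image = Image.open("document.png").convert("RGB")
encoding = processor(images=image, text="Question answering. What is the date?", return_tensors="pt")
predicted_ids = model.generate(**encoding)
print(processor.batch_decode(predicted_ids, skip_special_tokens=True)[0])
```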
The original code can be found [here](https://github.com/microsoft/UDOP). +## Resources + +A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with UDOP. If +you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource. + +- Demo notebooks regarding UDOP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/UDOP) that show how +to fine-tune UDOP on a custom dataset as well as how to run inference. 🌎 +- [Document question answering task guide](../tasks/document_question_answering) ## UdopConfig diff --git a/src/transformers/models/udop/modeling_udop.py b/src/transformers/models/udop/modeling_udop.py index 0d817a324b..9d12d9cc2e 100644 --- a/src/transformers/models/udop/modeling_udop.py +++ b/src/transformers/models/udop/modeling_udop.py @@ -1597,9 +1597,14 @@ class UdopModel(UdopPreTrainedModel): >>> from datasets import load_dataset >>> import torch + >>> # load model and processor + >>> # in this case, we already have performed OCR ourselves + >>> # so we initialize the processor with `apply_ocr=False` >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False) >>> model = AutoModel.from_pretrained("microsoft/udop-large") + >>> # load an example image, along with the words and coordinates + >>> # which were extracted using an OCR engine >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") >>> example = dataset[0] >>> image = example["image"] @@ -1772,14 +1777,21 @@ class UdopForConditionalGeneration(UdopPreTrainedModel): >>> from datasets import load_dataset >>> # load model and processor + >>> # in this case, we already have performed OCR ourselves + >>> # so we initialize the processor with `apply_ocr=False` >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False) >>> model = UdopForConditionalGeneration.from_pretrained("microsoft/udop-large") + >>> # load an example image, along with the words and coordinates + >>> # which were extracted using an OCR engine >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") >>> example = dataset[0] >>> image = example["image"] >>> words = example["tokens"] >>> boxes = example["bboxes"] + + >>> # one can use the various task prefixes (prompts) used during pre-training + >>> # e.g. the task prefix for DocVQA is "Question answering. " >>> question = "Question answering. What is the date on the form?"
>>> encoding = processor(image, question, words, boxes=boxes, return_tensors="pt") @@ -1992,9 +2004,14 @@ class UdopEncoderModel(UdopPreTrainedModel): >>> from huggingface_hub import hf_hub_download >>> from datasets import load_dataset + >>> # load model and processor + >>> # in this case, we already have performed OCR ourselves + >>> # so we initialize the processor with `apply_ocr=False` >>> processor = AutoProcessor.from_pretrained("microsoft/udop-large", apply_ocr=False) >>> model = UdopEncoderModel.from_pretrained("microsoft/udop-large") + >>> # load an example image, along with the words and coordinates + >>> # which were extracted using an OCR engine >>> dataset = load_dataset("nielsr/funsd-layoutlmv3", split="train") >>> example = dataset[0] >>> image = example["image"] From 0f94e3e152dcc172dbd681aa0d0dc527d9338b8d Mon Sep 17 00:00:00 2001 From: Anton Vlasjuk <73884904+vasqu@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:36:43 +0200 Subject: [PATCH 072/110] Fix accelerate kwargs for versions <0.28.0 (#30086) * fix learning rate display issue in galore optimizer * fix kwarg in accelerate when using versions < 0.28.0 * this was supposed to be in the other PR whoops --- src/transformers/trainer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index f6e80ebafe..292ecad383 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -4374,8 +4374,9 @@ class Trainer: even_batches=accelerator_config.pop("even_batches"), use_seedable_sampler=accelerator_config.pop("use_seedable_sampler"), ) - # this would have been updated above, no need for it anymore - accelerator_config.pop("gradient_accumulation_kwargs") + # this would have been updated above, no need for it anymore + accelerator_config.pop("gradient_accumulation_kwargs") + args = { "deepspeed_plugin": self.args.deepspeed_plugin, "gradient_accumulation_plugin": gradient_accumulation_plugin, From 33bca5419c0e08b7b9e58d26a57ececcda0c9fa9 Mon Sep 17 00:00:00 2001 From: Xu Song Date: Wed, 10 Apr 2024 22:58:56 +0800 Subject: [PATCH 073/110] Fix typing annotation in hf_argparser (#30156) --- src/transformers/hf_argparser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index 3457058874..196035a4d1 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -376,7 +376,7 @@ class HfArgumentParser(ArgumentParser): raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}") return tuple(outputs) - def parse_json_file(self, json_file: str, allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: + def parse_json_file(self, json_file: Union[str, Path], allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: """ Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the dataclass types. @@ -398,7 +398,7 @@ class HfArgumentParser(ArgumentParser): outputs = self.parse_dict(data, allow_extra_keys=allow_extra_keys) return tuple(outputs) - def parse_yaml_file(self, yaml_file: str, allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: + def parse_yaml_file(self, yaml_file: Union[str, Path], allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: """ Alternative helper method that does not use `argparse` at all, instead loading a yaml file and populating the dataclass types. 
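The widened `Union[str, Path]` annotations in the patch above match what these helpers already accept at runtime (both ultimately just open the file); the fix is purely about typing. Below is a minimal sketch, assuming a hypothetical `TrainingConfig` dataclass and a local `config.yaml`, of a call that now type-checks cleanly:

```python
from dataclasses import dataclass
from pathlib import Path

from transformers import HfArgumentParser


@dataclass
class TrainingConfig:  # hypothetical dataclass for illustration
    learning_rate: float = 5e-5
    num_train_epochs: int = 3


parser = HfArgumentParser(TrainingConfig)
# a pathlib.Path argument now satisfies the Union[str, Path] annotation
(config,) = parser.parse_yaml_file(Path("config.yaml"))
print(config.learning_rate)
```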
From 0fe44059aed104b1a001b98fbf57332c866bf499 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 10 Apr 2024 16:59:13 +0200 Subject: [PATCH 074/110] Add recurrent gemma (#30143) * Fork. * RecurrentGemma initial commit. * Updating __init__.py. * Minor modification to how we initialize the cache. Changing how the config specifies the architecture. * Reformat code to 4 spaces. Fixed a few typos. * Fixed the forward pass. Still unclear on the cache? * Fixed the RecurrentGemmaForCausalLM * Minor comment that we might not need attention_mask and output_attention arguments. * Now cache should work as well. * Adding a temporary example to check whether the model generation works. * Adding the tests and updating imports. * Adding the example file missing in the previous commit. * First working example. * Removing .gitignore and reverting parts of __init__. * Re-add .gitignore. * Addressing comments for configuration. * Move mask creation to `_prepare_inputs_for_generation`. * First try at integration tests: 1. AttributeError: 'GriffinCausalLMOutput' object has no attribute 'attentions'. 2. `cache_position` not passed * Transferring between machines. * Running normal tests. * Minor fix. * More fixes. * Addressing more comments. * Minor fixes. * first stab at cleanup * more refactoring * fix copies and else * renaming and get init to work * fix causal mask creation * update * nit * fix a hell lot of things * updates * update conversion script * make all keys importable * nits * add auto mappings * properly convert ffw_up and down * add scaling * fix generations * for recurrent dtype * update * fix going beyond window * fixup * add missing files * current updates to remove last einops * finish modeling refactor * TADA * fix most failing tests after the compile fix * update tests * refactor and update * update * nits, fixup and update tests * more fixup * nits * fix imports * test format * fixups * nits * tuple typing * fix code quality * add model card * fix doc * skip most generation tests * nits * style * doc fixes * fix pr and check_copies?
* last nit * oops * Apply suggestions from code review Co-authored-by: Lysandre Debut * update * Update src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * update based on review * doc nit * fix quality * quality * fix slow test model path * update default dtype * ignore attributes that can be safely ignored in check config attributes * 0lallalala come on * save nit * style * remove to dict update * make sure we can also run in float16 * style --------- Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> Co-authored-by: Aleksandar Botev Co-authored-by: Leonard Berrada Co-authored-by: anushanf Co-authored-by: botev Co-authored-by: Lysandre Debut Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_vi.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/recurrent_gemma.md | 48 + docs/source/en/tasks/language_modeling.md | 2 +- src/transformers/__init__.py | 14 + src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/auto/tokenization_auto.py | 7 + .../models/recurrent_gemma/__init__.py | 59 ++ .../configuration_recurrent_gemma.py | 158 +++ .../convert_recurrent_gemma_to_hf.py | 222 +++++ .../modeling_recurrent_gemma.py | 938 ++++++++++++++++++ src/transformers/utils/dummy_pt_objects.py | 21 + tests/models/recurrent_gemma/__init__.py | 0 .../test_modeling_recurrent_gemma.py | 508 ++++++++++ utils/check_config_attributes.py | 2 + utils/check_repo.py | 1 + utils/not_doctested.txt | 1 + 32 files changed, 2001 insertions(+), 1 deletion(-) create mode 100644 docs/source/en/model_doc/recurrent_gemma.md create mode 100644 src/transformers/models/recurrent_gemma/__init__.py create mode 100644 src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py create mode 100644 src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py create mode 100644 src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py create mode 100644 tests/models/recurrent_gemma/__init__.py create mode 100644 tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py diff --git a/README.md b/README.md index 8518fc09dc..bb06321f12 100644 --- a/README.md +++ b/README.md @@ -476,6 +476,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1.
**[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_de.md b/README_de.md index 66fae98706..3eb2c63b03 100644 --- a/README_de.md +++ b/README_de.md @@ -472,6 +472,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. 
**[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_es.md b/README_es.md index e4f4dedc3e..202411bbf8 100644 --- a/README_es.md +++ b/README_es.md @@ -449,6 +449,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. 
**[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_fr.md b/README_fr.md index c2da27829e..7d6d2e765a 100644 --- a/README_fr.md +++ b/README_fr.md @@ -470,6 +470,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (de l'équipe Qwen, Alibaba Group) a été publié avec le rapport technique [blog post](https://qwenlm.github.io/blog/qwen-moe/) par Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (de Facebook) a été publié dans l'article [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) par Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (de Google Research) a été publié dans l'article [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) par Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat et Ming-Wei Chang. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (de Google) publié dans l'article [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) par the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (de Google Research) a été publié dans l'article [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) par Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (de META Platforms) a été publié dans l'article [Designing Network Design Space](https://arxiv.org/abs/2003.13678) par Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (de Google Research) a été publié dans l'article [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) par Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_hd.md b/README_hd.md index a5bd56ee1c..33ae62baf0 100644 --- a/README_hd.md +++ b/README_hd.md @@ -423,6 +423,7 @@ conda install conda-forge::transformers 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (the Qwen team, Alibaba Group से) Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. द्वाराअनुसंधान पत्र [blog post](https://qwenlm.github.io/blog/qwen-moe/) के साथ जारी किया गया 1.
**[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (फेसबुक से) साथ में कागज [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) पैट्रिक लुईस, एथन पेरेज़, अलेक्जेंड्रा पिक्टस, फैबियो पेट्रोनी, व्लादिमीर कारपुखिन, नमन गोयल, हेनरिक कुटलर, माइक लुईस, वेन-ताउ यिह, टिम रॉकटाशेल, सेबस्टियन रिडेल, डौवे कीला द्वारा। 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google अनुसंधान से) केल्विन गु, केंटन ली, ज़ोरा तुंग, पानुपोंग पसुपत और मिंग-वेई चांग द्वारा साथ में दिया गया पेपर [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909)। +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (Google से) the Griffin, RLHF and Gemma Teams. द्वाराअनुसंधान पत्र [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) के साथ जारी किया गया 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (META रिसर्च से) [Designing Network Design Space](https://arxiv.org/abs/2003.13678) पेपर के साथ जारी किया गया एब्स/2003.13678) इलिजा राडोसावोविक, राज प्रतीक कोसाराजू, रॉस गिर्शिक, कैमिंग ही, पिओटर डॉलर द्वारा। 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (गूगल रिसर्च से) साथ वाला पेपर [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) ह्युंग वोन चुंग, थिबॉल्ट फ़ेवरी, हेनरी त्साई, एम. जॉनसन, सेबेस्टियन रुडर द्वारा। diff --git a/README_ja.md b/README_ja.md index e42a5680a7..dbe2cecdbd 100644 --- a/README_ja.md +++ b/README_ja.md @@ -483,6 +483,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (the Qwen team, Alibaba Group から) Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. から公開された研究論文 [blog post](https://qwenlm.github.io/blog/qwen-moe/) 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook から) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela から公開された研究論文: [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research から) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang から公開された研究論文: [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (Google から) the Griffin, RLHF and Gemma Teams. から公開された研究論文 [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) 1. 
**[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (Google Research から) Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya から公開された研究論文: [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (META Platforms から) Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár から公開された研究論文: [Designing Network Design Space](https://arxiv.org/abs/2003.13678) 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research から) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder から公開された研究論文: [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) diff --git a/README_ko.md b/README_ko.md index 95cb1b0b79..547572c4f2 100644 --- a/README_ko.md +++ b/README_ko.md @@ -398,6 +398,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (the Qwen team, Alibaba Group 에서 제공)은 Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou.의 [blog post](https://qwenlm.github.io/blog/qwen-moe/)논문과 함께 발표했습니다. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (Facebook 에서) Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 의 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 논문과 함께 발표했습니다. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (Google Research 에서) Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 의 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 논문과 함께 발표했습니다. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (Google 에서 제공)은 the Griffin, RLHF and Gemma Teams.의 [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf)논문과 함께 발표했습니다. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (Google Research 에서) Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 의 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 논문과 함께 발표했습니다. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (META Research 에서) Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár 의 [Designing Network Design Space](https://arxiv.org/abs/2003.13678) 논문과 함께 발표했습니다. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (Google Research 에서) Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 의 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) 논문과 함께 발표했습니다. diff --git a/README_pt-br.md b/README_pt-br.md index 7d10ce5e8c..ad3e6bb90d 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -481,6 +481,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. 
**[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_ru.md b/README_ru.md index 03cc1b919e..7c5cb5f179 100644 --- a/README_ru.md +++ b/README_ru.md @@ -471,6 +471,7 @@ conda install conda-forge::transformers 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. 
**[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_te.md b/README_te.md index fa762d5659..90a443e434 100644 --- a/README_te.md +++ b/README_te.md @@ -473,6 +473,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. 
Johnson, Sebastian Ruder. diff --git a/README_vi.md b/README_vi.md index e7cad3b364..98114bf0a8 100644 --- a/README_vi.md +++ b/README_vi.md @@ -472,6 +472,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (từ the Qwen team, Alibaba Group) được phát hành với bài báo [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (từ Facebook) được phát hành với bài báo [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (từ Google Research) được phát hành với bài báo [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (từ Google) được phát hành với bài báo [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (từ Google Research) được phát hành với bài báo [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (từ META Platforms) được phát hành với bài báo [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (từ Google Research) được phát hành với bài báo [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/README_zh-hans.md b/README_zh-hans.md index 8ac4d5c388..bd7457b25f 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -422,6 +422,7 @@ conda install conda-forge::transformers 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (来自 the Qwen team, Alibaba Group) 伴随论文 [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou 发布. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (来自 Facebook) 伴随论文 [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) 由 Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela 发布。 1. 
**[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (来自 Google Research) 伴随论文 [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) 由 Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang 发布。 +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (来自 Google) 伴随论文 [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) 由 the Griffin, RLHF and Gemma Teams 发布。 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 0e31d8e6b5..1eb46fe700 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -434,6 +434,7 @@ conda install conda-forge::transformers 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. 1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela. 1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang. +1. **[RecurrentGemma](https://huggingface.co/docs/transformers/main/model_doc/recurrent-gemma)** (from Google) released with the paper [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams. 1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya. 1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Research) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár. 1. 
**[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index af44de4d10..7daf91c99d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -468,6 +468,8 @@ title: RAG - local: model_doc/realm title: REALM + - local: model_doc/recurrent_gemma + title: RecurrentGemma - local: model_doc/reformer title: Reformer - local: model_doc/rembert diff --git a/docs/source/en/index.md b/docs/source/en/index.md index ffa9ae3f4b..9c5c87d00f 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -243,6 +243,7 @@ Flax), PyTorch, and/or TensorFlow. | [Qwen2MoE](model_doc/qwen2_moe) | ✅ | ❌ | ❌ | | [RAG](model_doc/rag) | ✅ | ✅ | ❌ | | [REALM](model_doc/realm) | ✅ | ❌ | ❌ | +| [RecurrentGemma](model_doc/recurrent_gemma) | ✅ | ❌ | ❌ | | [Reformer](model_doc/reformer) | ✅ | ❌ | ❌ | | [RegNet](model_doc/regnet) | ✅ | ✅ | ✅ | | [RemBERT](model_doc/rembert) | ✅ | ✅ | ❌ | diff --git a/docs/source/en/model_doc/recurrent_gemma.md b/docs/source/en/model_doc/recurrent_gemma.md new file mode 100644 index 0000000000..35a8ce9e3a --- /dev/null +++ b/docs/source/en/model_doc/recurrent_gemma.md @@ -0,0 +1,48 @@ + + +# RecurrentGemma + +## Overview + +The Recurrent Gemma model was proposed in [RecurrentGemma: Moving Past Transformers for Efficient Open Language Models](https://storage.googleapis.com/deepmind-media/gemma/recurrentgemma-report.pdf) by the Griffin, RLHF and Gemma Teams of Google. + +The abstract from the paper is the following: + +*We introduce RecurrentGemma, an open language model which uses Google’s novel Griffin architecture. Griffin combines linear recurrences with local attention to achieve excellent performance on language. It has a fixed-sized state, which reduces memory use and enables efficient inference on long sequences. We provide a pre-trained model with 2B non-embedding parameters, and an instruction tuned variant. Both models achieve comparable performance to Gemma-2B despite being trained on fewer tokens.* + +Tips: + +- The original checkpoints can be converted using the conversion script `src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py` + +This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
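Basic usage follows the standard causal LM API. Below is a minimal generation sketch (not part of the patch), assuming the `google/recurrentgemma-2b` checkpoint referenced in this patch's configuration docstring:

```python
from transformers import AutoTokenizer, RecurrentGemmaForCausalLM

tokenizer = AutoTokenizer.from_pretrained("google/recurrentgemma-2b")
model = RecurrentGemmaForCausalLM.from_pretrained("google/recurrentgemma-2b")

# the fixed-size recurrent state keeps memory use flat on long sequences
inputs = tokenizer("The Griffin architecture combines", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```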
+ + +## RecurrentGemmaConfig + +[[autodoc]] RecurrentGemmaConfig + + +## RecurrentGemmaModel + +[[autodoc]] RecurrentGemmaModel + - forward + +## RecurrentGemmaForCausalLM + +[[autodoc]] RecurrentGemmaForCausalLM + - forward + diff --git a/docs/source/en/tasks/language_modeling.md b/docs/source/en/tasks/language_modeling.md index e1858ef248..88a4519271 100644 --- a/docs/source/en/tasks/language_modeling.md +++ b/docs/source/en/tasks/language_modeling.md @@ -37,7 +37,7 @@ You can finetune other architectures for causal language modeling following the Choose one of the following architectures: -[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [Cohere](../model_doc/cohere), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [Gemma](../model_doc/gemma), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Mamba](../model_doc/mamba), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MusicGen Melody](../model_doc/musicgen_melody), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) +[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), 
[Cohere](../model_doc/cohere), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [Gemma](../model_doc/gemma), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Mamba](../model_doc/mamba), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [Mixtral](../model_doc/mixtral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MusicGen Melody](../model_doc/musicgen_melody), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [Phi](../model_doc/phi), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Qwen2](../model_doc/qwen2), [Qwen2MoE](../model_doc/qwen2_moe), [RecurrentGemma](../model_doc/recurrent_gemma), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [StableLm](../model_doc/stablelm), [Starcoder2](../model_doc/starcoder2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index da29d77972..06a6a0859b 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -743,6 +743,7 @@ _import_structure = { "RealmConfig", "RealmTokenizer", ], + "models.recurrent_gemma": ["RecurrentGemmaConfig"], "models.reformer": ["REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "ReformerConfig"], "models.regnet": ["REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "RegNetConfig"], "models.rembert": ["REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP", "RemBertConfig"], @@ -3115,6 +3116,13 @@ else: "load_tf_weights_in_realm", ] ) + _import_structure["models.recurrent_gemma"].extend( + [ + "RecurrentGemmaForCausalLM", + "RecurrentGemmaModel", + "RecurrentGemmaPreTrainedModel", + ] + ) _import_structure["models.reformer"].extend( [ "REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5625,6 +5633,7 @@ if TYPE_CHECKING: RealmConfig, RealmTokenizer, ) + from .models.recurrent_gemma import RecurrentGemmaConfig from .models.reformer import REFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, ReformerConfig from .models.regnet import REGNET_PRETRAINED_CONFIG_ARCHIVE_MAP, RegNetConfig from .models.rembert import REMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP, RemBertConfig @@ -7687,6 +7696,11 @@ if TYPE_CHECKING: RealmScorer, load_tf_weights_in_realm, ) + from .models.recurrent_gemma import ( + RecurrentGemmaForCausalLM, + RecurrentGemmaModel, + RecurrentGemmaPreTrainedModel, + ) from .models.reformer 
import ( REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST, ReformerAttention, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 0599d3b876..4a5cd01add 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -187,6 +187,7 @@ from . import ( qwen2_moe, rag, realm, + recurrent_gemma, reformer, regnet, rembert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index bf46066002..0d5d9ae62b 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -198,6 +198,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("qwen2_moe", "Qwen2MoeConfig"), ("rag", "RagConfig"), ("realm", "RealmConfig"), + ("recurrent_gemma", "RecurrentGemmaConfig"), ("reformer", "ReformerConfig"), ("regnet", "RegNetConfig"), ("rembert", "RemBertConfig"), @@ -471,6 +472,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("qwen2_moe", "Qwen2MoE"), ("rag", "RAG"), ("realm", "REALM"), + ("recurrent_gemma", "RecurrentGemma"), ("reformer", "Reformer"), ("regnet", "RegNet"), ("rembert", "RemBERT"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 150dea04f3..6f3d9d17a3 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -183,6 +183,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("qdqbert", "QDQBertModel"), ("qwen2", "Qwen2Model"), ("qwen2_moe", "Qwen2MoeModel"), + ("recurrent_gemma", "RecurrentGemmaModel"), ("reformer", "ReformerModel"), ("regnet", "RegNetModel"), ("rembert", "RemBertModel"), @@ -469,6 +470,7 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( ("qdqbert", "QDQBertLMHeadModel"), ("qwen2", "Qwen2ForCausalLM"), ("qwen2_moe", "Qwen2MoeForCausalLM"), + ("recurrent_gemma", "RecurrentGemmaForCausalLM"), ("reformer", "ReformerModelWithLMHead"), ("rembert", "RemBertForCausalLM"), ("roberta", "RobertaForCausalLM"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 4bc5d81053..af30469f9c 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -363,6 +363,13 @@ else: ), ("rag", ("RagTokenizer", None)), ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)), + ( + "recurrent_gemma", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), ( "reformer", ( diff --git a/src/transformers/models/recurrent_gemma/__init__.py b/src/transformers/models/recurrent_gemma/__init__.py new file mode 100644 index 0000000000..3ac7ff1c99 --- /dev/null +++ b/src/transformers/models/recurrent_gemma/__init__.py @@ -0,0 +1,59 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import TYPE_CHECKING + +from ...utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_torch_available, +) + + +_import_structure = { + "configuration_recurrent_gemma": ["RecurrentGemmaConfig"], +} + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_recurrent_gemma"] = [ + "RecurrentGemmaForCausalLM", + "RecurrentGemmaModel", + "RecurrentGemmaPreTrainedModel", + ] + + +if TYPE_CHECKING: + from .configuration_recurrent_gemma import RecurrentGemmaConfig + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_recurrent_gemma import ( + RecurrentGemmaForCausalLM, + RecurrentGemmaModel, + RecurrentGemmaPreTrainedModel, + ) + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py new file mode 100644 index 0000000000..f5a3f9673a --- /dev/null +++ b/src/transformers/models/recurrent_gemma/configuration_recurrent_gemma.py @@ -0,0 +1,158 @@ +# coding=utf-8 +# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" RecurrentGemma model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class RecurrentGemmaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`RecurrentGemmaModel`]. It is used to instantiate a RecurrentGemma + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the RecurrentGemma-2B. + + e.g. [google/recurrentgemma-2b](https://huggingface.co/google/recurrentgemma-2b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + num_hidden_layers (`int`, *optional*, defaults to 26): + The number of hidden layers in the model. + vocab_size (`int`, *optional*, defaults to 256000): + Vocabulary size of the RecurrentGemma model. Defines the number of + different tokens that can be represented by the + `inputs_ids` passed when calling [`RecurrentGemmaModel`] + hidden_size (`int`, *optional*, defaults to 2560): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 7680): + Dimension of the MLP representations. + num_attention_heads (`int`, *optional*, defaults to 10): + The number of heads for the attention block and the number of + heads/blocks for the block-diagonal layers used in the RG-LRU gates.
+            This number must divide `hidden_size` and `lru_width`.
+        lru_width (`int` or `None`, *optional*):
+            Dimension of the hidden representations of the RG-LRU. If `None`
+            this will be set to `hidden_size`.
+        attention_window_size (`int`, *optional*, defaults to 2048):
+            The size of the attention window used in the attention block.
+        conv1d_width (`int`, *optional*, defaults to 4):
+            The kernel size of conv1d layers used in the recurrent blocks.
+        logits_soft_cap (`float`, *optional*, defaults to 30.0):
+            The value at which the logits are soft-capped after the final transformer and LM-head computation in the
+            causal LM architecture.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        hidden_activation (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The hidden activation used in the recurrent block as well as the MLP layer of the decoder layers.
+        partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+            The fraction of each attention head's dimension to which the rotary embeddings are applied.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        block_types (`List[str]`, *optional*, defaults to `('recurrent', 'recurrent', 'attention')`):
+            List of alternating block types that is tiled to assign each decoder layer its `temporal_block` type.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability applied after the attention softmax.
+        num_key_value_heads (`int`, *optional*):
+            Number of key-value heads used to implement Grouped Query Attention. Defaults to `num_attention_heads`
+            if not set.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether the query, key and value projections of the attention layer use a bias.
+        w_init_variance_scale (`float`, *optional*, defaults to 0.01):
+            The variance scale used for weight initialization.
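+
+    With the defaults, the three-entry `block_types` pattern is tiled across the 26 layers, so
+    `config.layers_block_type` reads `['recurrent', 'recurrent', 'attention', 'recurrent', ...]`,
+    i.e. every third layer is an attention block.
+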
+ ```python + >>> from transformers import RecurrentGemmaModel, RecurrentGemmaConfig + + >>> # Initializing a RecurrentGemma recurrentgemma-2b style configuration + >>> configuration = RecurrentGemmaConfig() + + >>> # Initializing a model from the recurrentgemma-2b style configuration + >>> model = RecurrentGemmaModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "recurrent_gemma" + + def __init__( + self, + num_hidden_layers=26, + vocab_size=256000, + hidden_size=2560, + intermediate_size=3 * 2560, + num_attention_heads=10, + lru_width=None, + attention_window_size=2048, + conv1d_width=4, + logits_soft_cap=30.0, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=0, + eos_token_id=1, + bos_token_id=2, + hidden_activation="gelu_pytorch_tanh", + partial_rotary_factor=0.5, + rope_theta=10000.0, + block_types=("recurrent", "recurrent", "attention"), + attention_dropout=0.0, + num_key_value_heads=None, + attention_bias=False, + w_init_variance_scale=0.01, + **kwargs, + ): + self.num_hidden_layers = num_hidden_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.lru_width = lru_width if lru_width is not None else hidden_size + self.attention_window_size = attention_window_size + self.conv1d_width = conv1d_width + self.logits_soft_cap = logits_soft_cap + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.partial_rotary_factor = partial_rotary_factor + self.block_types = list(block_types) + self.hidden_activation = hidden_activation + self.head_dim = self.hidden_size // self.num_attention_heads + self.num_key_value_heads = num_key_value_heads if num_key_value_heads is not None else num_attention_heads + if self.num_key_value_heads > self.num_attention_heads: + raise ValueError("The number of `num_key_value_heads` must be smaller than `num_attention_heads`") + self.attention_dropout = attention_dropout + self.attention_bias = attention_bias + self.w_init_variance_scale = w_init_variance_scale + self.final_w_init_variance_scale = 2.0 / self.num_hidden_layers + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + **kwargs, + ) + + @property + def layers_block_type(self): + return (self.block_types * 100)[: self.num_hidden_layers] diff --git a/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py b/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py new file mode 100644 index 0000000000..dc6619e217 --- /dev/null +++ b/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py @@ -0,0 +1,222 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import argparse
+import os
+import warnings
+
+import torch
+from accelerate import init_empty_weights
+
+from transformers import GemmaTokenizer, RecurrentGemmaConfig, RecurrentGemmaForCausalLM
+
+
+try:
+    from transformers import GemmaTokenizerFast
+except ImportError as e:
+    warnings.warn(str(e))
+    warnings.warn(
+        "The converted tokenizer will be the `slow` tokenizer. To use the fast one, update your `tokenizers` library and re-run the tokenizer conversion"
+    )
+    GemmaTokenizerFast = None
+
+import regex as re
+
+
+"""
+Sample usage:
+
+```
+python src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py \
+    --input_checkpoint /path/to/downloaded/recurrentgemma/checkpoint.pt --model_size 2B --output_dir /output/path
+```
+
+Thereafter, models can be loaded via:
+
+```py
+from transformers import GemmaTokenizerFast, RecurrentGemmaForCausalLM
+
+model = RecurrentGemmaForCausalLM.from_pretrained("/output/path")
+tokenizer = GemmaTokenizerFast.from_pretrained("/output/path")
+```
+
+Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
+come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
+"""
+
+gemma_2b_config = RecurrentGemmaConfig(
+    num_attention_heads=10,
+    num_key_value_heads=1,
+    hidden_size=2560,
+    intermediate_size=15360,
+    vocab_size=256000,
+    num_hidden_layers=26,
+)
+
+gemma_7b_config = RecurrentGemmaConfig()
+
+CONFIG_MAPPING = {"2B": gemma_2b_config, "7B": gemma_7b_config}
+LAYER_NAME_MAPPING = {"embedder.weight": "model.embed_tokens.weight"}
+
+
+def write_model(save_path, input_base_path, config, safe_serialization=True, push_to_hub=False, dtype=torch.float32):
+    print(f"Fetching all parameters from the checkpoint at '{input_base_path}'")
+    model_state_dict = torch.load(input_base_path, map_location="cpu")
+
+    REPLACEMENT = {
+        "blocks.": "layers.",
+        ".ffw_down.b": ".down_proj.b",
+        ".ffw_down.w": ".down_proj.w",
+        ".ffw_up.b": ".up_proj.bias",
+        ".ffw_up.w": ".up_proj.weight",
+        "recurrent_block": "temporal_block",
+        "attention_block": "temporal_block",
+        "temporal_block.proj_final": "temporal_block.out_proj",
+        "norm.scale": "norm.weight",
+        ".proj_k": ".k_proj",
+        ".proj_q": ".q_proj",
+        ".proj_v": ".v_proj",
+        ".proj_final": ".o_proj",
+        "embedder.input_embedding": "embed_tokens.weight",
+        "conv_1d.w": "conv_1d.weight",
+        "conv_1d.b": "conv_1d.bias",
+        "input_gate.w": "input_gate.weight",
+        "input_gate.b": "input_gate.bias",
+        "a_param": "recurrent_param",
+        "a_gate.b": "recurrent_gate.bias",
+        "a_gate.w": "recurrent_gate.weight",
+    }
+
+    state_dict = {}
+    pattern = re.compile("|".join(map(re.escape, REPLACEMENT.keys())))
+    for k, v in model_state_dict.items():
+        k = "model." + k
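+        # `model.` was prepended above; the remaining name pieces are rewritten
+        # with the REPLACEMENT table through a single regex substitution.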
+        key = pattern.sub(lambda match: REPLACEMENT[match.group(0)], k)
+        if "conv_1d.weight" in key:
+            v = v[:, None, :].transpose(0, 2)
+        if "up_proj.weight" in key:
+            state_dict[key.replace("up_proj", "gate_proj")] = v[0].T.contiguous()
+            v = v[1].T.contiguous()
+        if "up_proj.bias" in key:
+            state_dict[key.replace("up_proj", "gate_proj")] = v[0, 0, 0].clone()
+            v = v[1, 0, 0].contiguous()
+        if "recurrent_gate.bias" in key:
+            state_dict[key.replace("gate.", "gate_")] = v.contiguous().clone()
+        elif "recurrent_gate.weight" in key:
+            state_dict[key.replace("gate.", "gate_")] = v.contiguous().clone()
+        elif "input_gate.b" in key:
+            state_dict[key.replace("gate.", "gate_")] = v.contiguous().clone()
+        elif "input_gate.w" in key:
+            state_dict[key.replace("gate.", "gate_")] = v.contiguous().clone()
+        elif "embed_tokens" in key:
+            state_dict[key] = v[: config.vocab_size, :].contiguous().clone()
+            state_dict["lm_head.weight"] = v[: config.vocab_size, :].contiguous().clone()
+        else:
+            state_dict[key] = v.contiguous()
+
+    torch.set_default_dtype(dtype)
+
+    print("Loading the checkpoint in a RecurrentGemma model.")
+    with init_empty_weights():
+        model = RecurrentGemmaForCausalLM(config)
+    model.load_state_dict(state_dict, assign=True, strict=True)
+
+    model.config.torch_dtype = dtype
+    del model.config._name_or_path
+    print("Saving in the Transformers format.")
+
+    if push_to_hub:
+        print(f"Pushing the model to {save_path}")
+        model.push_to_hub(save_path, safe_serialization=safe_serialization)
+    else:
+        model.save_pretrained(save_path, safe_serialization=safe_serialization)
+
+
+def write_tokenizer(input_tokenizer_path, save_path, push_to_hub=False):
+    # Initialize the tokenizer based on the `spm` model
+    tokenizer_class = GemmaTokenizer if GemmaTokenizerFast is None else GemmaTokenizerFast
+    print(f"Saving a {tokenizer_class.__name__} to {save_path}.")
+    tokenizer = tokenizer_class(input_tokenizer_path)
+    if push_to_hub:
+        tokenizer.push_to_hub(save_path)
+    else:
+        tokenizer.save_pretrained(save_path)
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_checkpoint",
+        help="Absolute path to the original RecurrentGemma checkpoint.",
+    )
+    parser.add_argument(
+        "--tokenizer_checkpoint",
+        help="Location of the Gemma tokenizer model",
+    )
+    parser.add_argument(
+        "--model_size",
+        default="2B",
+        choices=["2B", "7B", "tokenizer_only"],
+        help="Which checkpoint to convert: `2B`, `7B`, or `tokenizer_only` to skip the model weights.",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default="google/recurrent-gemma-2b-it-hf",
+        help="Location to write HF model and tokenizer",
+    )
+    parser.add_argument(
+        "--pickle_serialization",
+        help="Whether or not to save using pickle (`.bin`) instead of `safetensors`.",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "--convert_tokenizer",
+        help="Whether or not to convert the tokenizer as well.",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "--push_to_hub",
+        help="Whether or not to push the model to the hub at `output_dir` instead of saving it locally.",
+        action="store_true",
+        default=False,
+    )
+    parser.add_argument(
+        "--dtype",
+        default="float32",
+        help="Target dtype of the converted model",
+    )
+    args = parser.parse_args()
+
+    if args.convert_tokenizer:
+        if args.tokenizer_checkpoint is None:
+            raise ValueError("Path to the tokenizer is required when passing --convert_tokenizer")
+
+        spm_path = os.path.join(args.tokenizer_checkpoint)
+        write_tokenizer(spm_path, args.output_dir, args.push_to_hub)
+
+    if args.model_size != "tokenizer_only":
+        if args.input_checkpoint is None:
+            raise ValueError("Path to the checkpoint is required when converting the model weights")
+
+        config = CONFIG_MAPPING[args.model_size]
+        dtype = getattr(torch, args.dtype)
+        write_model(
+            config=config,
+            input_base_path=args.input_checkpoint,
+            save_path=args.output_dir,
+            safe_serialization=not args.pickle_serialization,
+            push_to_hub=args.push_to_hub,
+            dtype=dtype,
+        )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py
new file mode 100644
index 0000000000..26cdc437d0
--- /dev/null
+++ b/src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py
@@ -0,0 +1,938 @@
+# coding=utf-8
+# Copyright 2024 Google Inc. HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch RecurrentGemma model.""" + +import math +from typing import Dict, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import BaseModelOutputWithNoAttention, CausalLMOutput +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ALL_LAYERNORM_LAYERS +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_recurrent_gemma import RecurrentGemmaConfig + + +logger = logging.get_logger(__name__) +_CONFIG_FOR_DOC = "RecurrentGemmaConfig" +_MAX_SQRT_GRADIENT = 1000.0 + + +# Copied from transformers.models.gemma.modeling_gemma.GemmaRMSNorm with Gemma->RecurrentGemma +class RecurrentGemmaRMSNorm(nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.zeros(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()) + # Llama does x.to(float16) * w whilst RecurrentGemma is (x * w).to(float16) + # See https://github.com/huggingface/transformers/pull/29402 + output = output * (1.0 + self.weight.float()) + return output.type_as(x) + + +ALL_LAYERNORM_LAYERS.append(RecurrentGemmaRMSNorm) + + +class RecurrentGemmaRotaryEmbedding(nn.Module): + def __init__(self, dim, base=10000, device=None): + super().__init__() + self.dim = dim + self.base = base + self.register_buffer("inv_freq", None, persistent=False) + + @torch.no_grad() + # Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding.forward with Gemma->RecurrentGemma + def forward(self, x, position_ids, seq_len=None): + # x: [bs, num_attention_heads, seq_len, head_size] + if self.inv_freq is None: + self.inv_freq = 1.0 / ( + self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim) + ) + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1) + position_ids_expanded = position_ids[:, None, :].float() + # Force float32 since bfloat16 loses precision on long contexts + # See https://github.com/huggingface/transformers/pull/29285 + device_type = x.device.type + device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() + sin = emb.sin() + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +# Copied from transformers.models.llama.modeling_llama.rotate_half +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. 
+ position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +# Copied from transformers.models.llama.modeling_llama.repeat_kv +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +class RecurrentGemmaSdpaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: RecurrentGemmaConfig): + super().__init__() + self.config = config + self.attention_dropout = config.attention_dropout + self.hidden_size = config.hidden_size + self.num_attention_heads = config.num_attention_heads + self.head_dim = config.head_dim + self.num_key_value_heads = config.num_key_value_heads + self.num_key_value_groups = self.num_attention_heads // self.num_key_value_heads + self.partial_rotary_factor = config.partial_rotary_factor + + self.q_proj = nn.Linear(self.hidden_size, self.num_attention_heads * self.head_dim, bias=config.attention_bias) + self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias) + self.o_proj = nn.Linear(self.num_attention_heads * self.head_dim, self.hidden_size, bias=True) + self.rotary_emb = RecurrentGemmaRotaryEmbedding( + int(self.partial_rotary_factor * self.head_dim), + base=config.rope_theta, + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cache_position: Optional[torch.LongTensor] = None, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_attention_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, q_len, self.num_key_value_heads, 
self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + cos, sin = self.rotary_emb(value_states, position_ids, seq_len=None) + + # Partial rotary embedding + query_rot, query_pass = torch.chunk(query_states, int(1 / self.partial_rotary_factor), dim=-1) + key_rot, key_pass = torch.chunk(key_states, int(1 / self.partial_rotary_factor), dim=-1) + query_rot, key_rot = apply_rotary_pos_emb(query_rot, key_rot, cos, sin, position_ids) + query_states = torch.cat((query_rot, query_pass), dim=-1) + key_states = torch.cat((key_rot, key_pass), dim=-1) + + if use_cache and hasattr(self, "key_states"): + cache_kwargs = {"cache_position": cache_position} + key_states, value_states = self._update_cache(key_states, value_states, **cache_kwargs) + + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + causal_mask = attention_mask + if attention_mask is not None: + causal_mask = causal_mask[:, :, :, : key_states.shape[-2]] + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states.contiguous(), + key_states.contiguous(), + value_states.contiguous(), + attn_mask=causal_mask, # pretty much a must for sliding window backend! + dropout_p=self.attention_dropout if self.training else 0.0, + scale=self.head_dim**-0.5, + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.view(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + return attn_output + + def _setup_cache(self, batch_size, device, dtype=None): + if dtype is None and self.config.torch_dtype is not None: + dtype = self.config.torch_dtype + dtype = dtype if dtype is not None else torch.float32 + cache_shape = (batch_size, self.num_key_value_heads, self.config.attention_window_size, self.head_dim) + self.value_states = torch.zeros(cache_shape, dtype=dtype, device=device) + self.key_states = torch.zeros(cache_shape, dtype=dtype, device=device) + + @torch.no_grad() + def _update_cache(self, key_states, value_states, **cache_kwargs): + """ + torch.compile compatible sliding window. + Computes the `indices` based on `cache_position >= self.config.attention_window_size - 1`. + The `to_shift` is only true once we are above attention_window_size. Thus with `attention_window_size==64`: + + indices = (slicing + to_shift[-1].int()-1) % self.config.attention_window_size + tensor([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 0]) + + We overwrite the cache using these, then we always write at cache_position (clamped to `attention_window_size`) + """ + cache_position = cache_kwargs.get("cache_position") + if cache_position.shape[0] > self.config.attention_window_size: + # int indexing -> device sync? 
in compile, use tensor + k_out = key_states[:, :, -self.config.attention_window_size :, :] + v_out = value_states[:, :, -self.config.attention_window_size :, :] + else: + slicing = torch.ones( + self.config.attention_window_size, dtype=torch.long, device=value_states.device + ).cumsum(0) + cache_position = cache_position.clamp(0, self.config.attention_window_size - 1) + to_shift = cache_position >= self.config.attention_window_size - 1 + indices = (slicing + to_shift[-1].int() - 1) % self.config.attention_window_size + + k_out, v_out = self.key_states, self.value_states + k_out = k_out[:, :, indices] + v_out = v_out[:, :, indices] + + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + self.key_states, self.value_states = k_out, v_out + return k_out, v_out + + +class SqrtBoundDerivative(torch.autograd.Function): + """Computes a square root with a gradient clipped at `_MAX_SQRT_GRADIENT`.""" + + @staticmethod + def forward(ctx, x: torch.Tensor) -> torch.Tensor: + """The forward pass, which is a normal `sqrt`.""" + ctx.save_for_backward(x) + return torch.sqrt(x) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + """The backward pass, which clips the `sqrt` gradient.""" + (x,) = ctx.saved_tensors + clipped_x_times_4 = torch.clip(4.0 * x, min=1 / (_MAX_SQRT_GRADIENT**2)) + return grad_output / torch.sqrt(clipped_x_times_4) + + +class RecurrentGemmaRglru(nn.Module): + """A Real-Gated Linear Recurrent Unit (RG-LRU) layer.""" + + def __init__(self, config): + super().__init__() + self.num_attention_heads = config.num_attention_heads + self.block_width = config.lru_width // self.num_attention_heads + + self.recurrent_param = nn.Parameter(torch.empty([config.lru_width])) + self.input_gate_weight = nn.Parameter( + torch.empty([self.num_attention_heads, self.block_width, self.block_width]) + ) + self.input_gate_bias = nn.Parameter(torch.empty([self.num_attention_heads, self.block_width])) + + self.recurrent_gate_weight = nn.Parameter( + torch.empty([self.num_attention_heads, self.block_width, self.block_width]) + ) + self.recurrent_gate_bias = nn.Parameter(torch.empty([self.num_attention_heads, self.block_width])) + self.recurrent_states = None + + def forward( + self, + activations: torch.Tensor, + position_ids: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + batch_size, seq_len, lru_width = activations.shape + reset = position_ids[:, :, None] == 0 + + reshape_act = activations.reshape(batch_size * seq_len, self.num_attention_heads, self.block_width) + reshape_act = reshape_act.permute(1, 0, 2) + + res = torch.baddbmm(self.input_gate_bias[:, None, :], reshape_act, self.input_gate_weight) + input_gate = torch.sigmoid(res.transpose(0, 1).reshape(batch_size, seq_len, lru_width)) + + res = torch.baddbmm(self.recurrent_gate_bias[:, None, :], reshape_act, self.recurrent_gate_weight) + recurrent_gate = torch.sigmoid(res.transpose(0, 1).reshape(batch_size, seq_len, lru_width)) + + # Compute the parameter `A` of the recurrence. + log_recurrent_gate = -8.0 * recurrent_gate * nn.functional.softplus(self.recurrent_param) + recurrent_gate = torch.exp(log_recurrent_gate) + a_square = torch.exp(2 * log_recurrent_gate) + + # Gate the input. + gated_inputs = activations * input_gate + + # Apply gamma normalization to the input. We need to clip the derivatives of + # `sqrt` in order to prevent NaNs during training in bfloat16. 
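+        # `SqrtBoundDerivative.apply` matches `torch.sqrt` in the forward pass;
+        # only the backward pass is altered, clipping the gradient factor to at
+        # most `_MAX_SQRT_GRADIENT` so that e.g. the derivative at 0 stays finite.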
+        multiplier = 1
+        tracing = isinstance(activations, torch.fx.Proxy) or (
+            hasattr(torch, "_dynamo") and torch._dynamo.is_compiling()
+        )
+        if not torch.jit.is_tracing() and not tracing:
+            multiplier = SqrtBoundDerivative.apply(1 - a_square)
+        multiplier = reset + ~reset * multiplier
+        normalized_x = gated_inputs * multiplier.type(activations.dtype)
+
+        hidden_states, recurrent_states = self._rnn_scan(
+            hidden_states=normalized_x,
+            recurrent_gate=recurrent_gate,
+            reset=reset,
+            recurrent_states=self.recurrent_states,
+        )
+        self.recurrent_states = recurrent_states
+        return hidden_states
+
+    # TODO refactor
+    def _rnn_scan(
+        self,
+        hidden_states: torch.Tensor,
+        recurrent_gate: torch.Tensor,
+        reset: torch.Tensor,
+        recurrent_states: Union[torch.Tensor, None],
+        acc_dtype: torch.dtype = torch.float32,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Runs the recurrence of a linear RNN.
+
+        Args:
+            hidden_states: The input sequence.
+            recurrent_gate: The diagonal of the recurrence matrix `A`.
+            reset: Indicator of document boundaries, e.g. when to reset the hidden state
+                of the RNN.
+            recurrent_states: The initial hidden state.
+            acc_dtype: The data type for the accumulation.
+
+        Returns:
+            The output of the linear recurrence.
+        """
+        # Multiply `a` by the reset.
+        recurrent_gate = recurrent_gate * ~reset
+
+        if hidden_states.shape[1] == 1:
+            # Using scan in sampling mode.
+            if recurrent_states is None:  # when decoding, the cache is always set, so this only happens on the very first call
+                return hidden_states, hidden_states[:, 0].type(acc_dtype)
+
+            else:
+                contextualized_states = recurrent_gate.type(acc_dtype) * recurrent_states[:, None]
+                contextualized_states += hidden_states.type(acc_dtype)
+                return contextualized_states.type(hidden_states.dtype), contextualized_states[:, -1]
+
+        else:
+            # Using scan in linear mode.
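+            # A minimal sketch of the recurrence computed below (hypothetical
+            # scalar values): with a = (0.5, 0.5), x = (1.0, 1.0) and h_0 = 0,
+            # the loop yields h_1 = 0.5 * 0 + 1.0 = 1.0 and h_2 = 0.5 * 1.0 + 1.0 = 1.5.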
+ if recurrent_states is None: + recurrent_states = torch.zeros(hidden_states[:, 0].shape, dtype=acc_dtype, device=hidden_states.device) + + contextualized_states = torch.zeros_like(hidden_states) + for t in range(hidden_states.shape[1]): + recurrent_states = recurrent_gate[:, t].type(acc_dtype) * recurrent_states + recurrent_states = recurrent_states + hidden_states[:, t].type(acc_dtype) + contextualized_states[:, t] = recurrent_states.type(hidden_states.dtype) + + return contextualized_states, recurrent_states + + +class RecurrentGemmaRecurrentBlock(nn.Module): + """Griffin and Hawk's recurrent block.""" + + def __init__(self, config): + super().__init__() + self.lru_width = config.lru_width + self.hidden_size = config.hidden_size + self.linear_y = nn.Linear(in_features=config.hidden_size, out_features=config.lru_width) + self.linear_x = nn.Linear(in_features=config.hidden_size, out_features=config.lru_width) + self.linear_out = nn.Linear(in_features=config.lru_width, out_features=config.hidden_size) + self.conv1d_width = config.conv1d_width + self.conv_1d = nn.Conv1d( + config.lru_width, + config.lru_width, + kernel_size=config.conv1d_width, + groups=config.lru_width, + padding=config.conv1d_width - 1, + ) + self.rg_lru = RecurrentGemmaRglru(config) + self.act_fn = ACT2FN[config.hidden_activation] + + self.conv1d_state = None + + def forward( + self, + input_states: torch.Tensor, + position_ids: torch.Tensor, + attention_mask: torch.Tensor, + cache_position: torch.Tensor, + use_cache: bool = True, + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + _, seq_len, _ = input_states.shape + + y_branch = self.linear_y(input_states) + y_branch = self.act_fn(y_branch) + + x_branch = self.linear_x(input_states) + x_branch = x_branch.transpose(1, 2) + + if use_cache: + if cache_position.shape[0] != 1: # prefill + self.conv1d_state = nn.functional.pad(x_branch, (self.conv1d_width - x_branch.shape[-1] - 1, 0)) + x_branch = self.conv_1d(x_branch)[..., :seq_len] + else: # decoding + conv_state = torch.cat((self.conv1d_state, x_branch), -1) + x_branch = torch.sum(conv_state * self.conv_1d.weight[:, 0, :], dim=-1) + self.conv_1d.bias + x_branch = x_branch.unsqueeze(-1) + self.conv1d_state = conv_state[:, :, 1:] + else: + x_branch = self.conv_1d(x_branch)[..., :seq_len] + + x_branch = self.rg_lru(x_branch.transpose(1, 2), position_ids) + + hidden_states = x_branch * y_branch + hidden_states = self.linear_out(hidden_states) + return hidden_states + + def _setup_cache(self, batch, device, dtype): + # recurrent_states always computed in full precision + self.rg_lru.recurrent_states = torch.zeros((batch, self.lru_width), device=device, dtype=torch.float32) + self.conv1d_state = torch.zeros((batch, self.hidden_size, self.conv1d_width - 1), device=device, dtype=dtype) + + +TEMPORAL_BLOCK_CLASSES = {"recurrent": RecurrentGemmaRecurrentBlock, "attention": RecurrentGemmaSdpaAttention} + + +class RecurrentGemmaMlp(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size // 2 + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=True) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=True) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=True) + self.act_fn = ACT2FN[config.hidden_activation] + + def forward(self, hidden_states): + gate = self.act_fn(self.gate_proj(hidden_states)) + return self.down_proj(gate * 
self.up_proj(hidden_states))
+
+
+class RecurrentGemmaDecoderLayer(nn.Module):
+    """Griffin and Hawk's residual block."""
+
+    def __init__(self, config, layer_idx):
+        super().__init__()
+        self.temporal_pre_norm = RecurrentGemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.temporal_block = TEMPORAL_BLOCK_CLASSES[config.layers_block_type[layer_idx]](config)
+        self.channel_pre_norm = RecurrentGemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.mlp_block = RecurrentGemmaMlp(config)
+
+    def forward(
+        self,
+        activations: torch.Tensor,
+        position_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        cache_position: torch.Tensor = None,
+        use_cache: bool = None,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
+        raw_activations = activations
+        inputs_normalized = self.temporal_pre_norm(raw_activations)  # RMSNorm introduces slight differences
+
+        hidden_states = self.temporal_block(
+            inputs_normalized, position_ids, attention_mask, cache_position=cache_position, use_cache=use_cache
+        )
+
+        residual = hidden_states + raw_activations
+
+        hidden_states = self.channel_pre_norm(residual)
+        hidden_states = self.mlp_block(hidden_states)
+
+        hidden_states = hidden_states + residual
+        return hidden_states
+
+
+RECURRENTGEMMA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`RecurrentGemmaConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+""" + + +@add_start_docstrings( + "The bare RecurrentGemma Model outputting raw hidden-states without any specific head on top.", + RECURRENTGEMMA_START_DOCSTRING, +) +class RecurrentGemmaPreTrainedModel(PreTrainedModel): + config_class = RecurrentGemmaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["RecurrentGemmaDecoderLayer"] + _skip_keys_device_placement = ["cache"] + _supports_flash_attn_2 = False + _supports_sdpa = False # we can't compare with eager for now + _supports_cache_class = True + + def _init_weights(self, module): + std = math.sqrt(self.config.w_init_variance_scale / self.config.conv1d_width) + if isinstance(module, nn.Conv1d): + torch.nn.init.normal_(module.weight, mean=0.0, std=std) + torch.nn.init.zeros_(module.bias) + elif isinstance(module, RecurrentGemmaSdpaAttention): + torch.nn.init.normal_(module.q_proj.weight, mean=0.0, std=math.sqrt(1.0 / self.config.hidden_size)) + torch.nn.init.normal_(module.k_proj.weight, mean=0.0, std=math.sqrt(1.0 / self.config.hidden_size)) + torch.nn.init.normal_(module.v_proj.weight, mean=0.0, std=math.sqrt(1.0 / self.config.hidden_size)) + + std = math.sqrt(self.config.final_w_init_variance_scale / self.config.hidden_size) + torch.nn.init.normal_(module.o_proj.weight, mean=0.0, std=std) + elif isinstance(module, RecurrentGemmaRecurrentBlock): + torch.nn.init.zeros_(module.linear_x.bias) + torch.nn.init.normal_(module.linear_x.weight, mean=0.0, std=math.sqrt(1.0 / self.config.hidden_size)) + + torch.nn.init.zeros_(module.linear_y.bias) + torch.nn.init.normal_(module.linear_y.weight, mean=0.0, std=math.sqrt(1.0 / self.config.hidden_size)) + + std = math.sqrt(self.config.final_w_init_variance_scale / self.config.lru_width) + torch.nn.init.normal_(module.linear_out.weight, mean=0.0, std=std) + torch.nn.init.zeros_(module.linear_out.bias) + elif isinstance(module, RecurrentGemmaRglru): + std = math.sqrt( + self.config.w_init_variance_scale / (self.config.lru_width // self.config.num_attention_heads) + ) + torch.nn.init.normal_(module.input_gate_weight, mean=0.0, std=std) + torch.nn.init.normal_(module.recurrent_gate_weight, mean=0.0, std=std) + torch.nn.init.zeros_(module.input_gate_bias) + torch.nn.init.zeros_(module.recurrent_gate_bias) + + module.recurrent_param.data.uniform_(0.9**2 + 1e-8, 0.999**2 + 1e-8) + module.recurrent_param.data.log_().mul_(0.5) + module.recurrent_param.data.neg_().exp_().sub_(1.0).log_() + elif isinstance(module, nn.Linear): + torch.nn.init.normal_(module.weight, mean=0.0, std=std) + if getattr(module, "bias", None) is not None: + torch.nn.init.zeros_(module.bias) + + def _setup_cache(self, config, batch, device, dtype): + layers = getattr(self, "model", self).layers + for layer in layers: + layer.temporal_block._setup_cache(batch, device, dtype) + + def reset_cache(self, batch, device, dtype): + pass + + +RECURRENTGEMMA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. 
Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+            Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
+            this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
+            the complete sequence length.
+"""
+
+
+@add_start_docstrings(
+    "The bare RecurrentGemma Model outputting raw hidden-states without any specific head on top.",
+    RECURRENTGEMMA_START_DOCSTRING,
+)
+class RecurrentGemmaModel(RecurrentGemmaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers.
Each layer is a [`RecurrentGemmaDecoderLayer`] + + Args: + config: RecurrentGemmaConfig + """ + + def __init__(self, config: RecurrentGemmaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [RecurrentGemmaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.final_norm = RecurrentGemmaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.gradient_checkpointing = False + + self.register_buffer("normalizer", torch.tensor(self.config.hidden_size**0.5, dtype=torch.bfloat16)) + # Initialize weights and apply final processing + self.post_init() + + # Copied from transformers.models.llama.modeling_llama.LlamaModel.get_input_embeddings + def get_input_embeddings(self): + return self.embed_tokens + + # Copied from transformers.models.llama.modeling_llama.LlamaModel.set_input_embeddings + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(RECURRENTGEMMA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: torch.LongTensor = None, + position_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cache_position: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, BaseModelOutputWithNoAttention]: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + if use_cache and inputs_embeds.shape[1] != 1: # TODO let's maybe only call in the `generate`? 
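+            # Prefill: (re)allocate the sliding-window KV caches of the attention
+            # blocks and the recurrent/conv1d states on every multi-token prompt.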
+ self._setup_cache(self.config, hidden_states.shape[0], hidden_states.device, hidden_states.dtype) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position) + + hidden_states = hidden_states * self.normalizer.type(hidden_states.dtype) + + all_hidden_states = () if output_hidden_states else None + for i, residual_block in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.gradient_checkpointing and self.training: + hidden_states = self._gradient_checkpointing_func( + residual_block.__call__, hidden_states, position_ids, causal_mask, cache_position, use_cache + ) + else: + hidden_states = residual_block(hidden_states, position_ids, causal_mask, cache_position, use_cache) + + hidden_states = self.final_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + ) + + # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length even when the static + # KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at each decode steps due to the dynamic shapes. + # (`recording cudagraph tree for symint key 13`, etc.), which is VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using + # `fullgraph=True`. See more context in https://github.com/huggingface/transformers/pull/29114 + # Ignore copy + def _update_causal_mask(self, attention_mask, input_tensor, cache_position): + dtype, device = input_tensor.dtype, input_tensor.device + min_dtype = torch.finfo(dtype).min + sequence_length = input_tensor.shape[1] + target_length = max(self.config.attention_window_size, sequence_length) + + diagonal = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device) + causal_mask = diagonal + if sequence_length != 1: + causal_mask = torch.triu(diagonal, diagonal=-1) + + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + if attention_mask.dim() == 2: + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0) + causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype) + + if attention_mask is not None and attention_mask.device.type == "cuda": + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. 
+ # Details: https://github.com/pytorch/pytorch/issues/110213 + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + +# Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM with LLAMA->RECURRENTGEMMA,Llama->RecurrentGemma,llama->gemma +class RecurrentGemmaForCausalLM(RecurrentGemmaPreTrainedModel): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + super().__init__(config) + self.model = RecurrentGemmaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + # Ignore copy + @add_start_docstrings_to_model_forward(RECURRENTGEMMA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + use_cache: Optional[bool] = None, + **kwargs, # for now we need this for generation + ) -> Union[Tuple, CausalLMOutput]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, RecurrentGemmaForCausalLM + + >>> model = RecurrentGemmaForCausalLM.from_pretrained("google/recurrentgemma-2b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/recurrentgemma-2b") + + >>> prompt = "What is your favorite condiment?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "What is your favorite condiment?" + ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = True + outputs = self.model( + input_ids=input_ids, + cache_position=cache_position, + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + + # Soft-cap the logits TODO remove if always done. 
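+        # Capping maps the logits smoothly into (-cap, cap): with the default
+        # cap of 30.0, a raw logit of 60.0 becomes 30.0 * tanh(2.0) ~= 28.9.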
+        # if self.config.logits_soft_cap is not None:
+        cap = self.config.logits_soft_cap
+        logits = nn.functional.tanh(logits / cap) * cap
+
+        logits = logits.float()
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
+
+    # Ignore copy
+    def prepare_inputs_for_generation(
+        self, input_ids, attention_mask=None, inputs_embeds=None, cache_position=None, use_cache=None, **kwargs
+    ):
+        position_ids = kwargs.get("position_ids", None)
+        if attention_mask is not None and position_ids is None:
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+
+        if attention_mask is not None:
+            attention_mask = attention_mask[:, -self.config.attention_window_size :]
+
+        past_length = cache_position[0]
+        if past_length > 0:
+            position_ids = position_ids[:, past_length:]
+
+        if inputs_embeds is not None:
+            model_inputs = {"inputs_embeds": inputs_embeds[:, past_length:]}
+        else:
+            model_inputs = {"input_ids": input_ids[:, past_length:].contiguous()}
+
+        if cache_position is not None:
+            cache_position = cache_position[-position_ids.shape[1] :]
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "attention_mask": attention_mask,
+                "cache_position": cache_position,
+                "use_cache": use_cache,
+            }
+        )
+        return model_inputs
+
+    # Ignore copy
+    def _reorder_cache(self, past_key_values, beam_idx):
+        for layer in self.model.layers:
+            if hasattr(layer.temporal_block, "key_states"):
+                k_state = layer.temporal_block.key_states
+                v_state = layer.temporal_block.value_states
+                k_state = k_state.index_select(0, beam_idx.to(k_state.device))
+                v_state = v_state.index_select(0, beam_idx.to(v_state.device))
+        return None
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 1bdab80a13..1c04fb9082 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -7051,6 +7051,27 @@ def load_tf_weights_in_realm(*args, **kwargs):
     requires_backends(load_tf_weights_in_realm, ["torch"])


+class RecurrentGemmaForCausalLM(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class RecurrentGemmaModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class RecurrentGemmaPreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 REFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/tests/models/recurrent_gemma/__init__.py b/tests/models/recurrent_gemma/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py
new file mode 100644
index 0000000000..ae1d9e7079
--- /dev/null
+++
b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -0,0 +1,508 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch RecurrentGemma model. """ +import unittest + +from parameterized import parameterized + +from transformers import AutoModelForCausalLM, AutoTokenizer, RecurrentGemmaConfig, is_torch_available, set_seed +from transformers.testing_utils import ( + require_bitsandbytes, + require_read_token, + require_torch, + require_torch_gpu, + slow, + torch_device, +) + +from ...generation.test_utils import GenerationTesterMixin +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import RecurrentGemmaForCausalLM, RecurrentGemmaModel + + +class RecurrentGemmaModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=12, + is_training=True, + use_input_mask=True, + use_token_type_ids=False, + use_labels=True, + num_hidden_layers=3, + vocab_size=99, + hidden_size=32, + intermediate_size=3 * 32, + num_attention_heads=2, + lru_width=2 * 32, + embeddings_scale_by_sqrt_dim=True, + attention_window_size=16, + conv1d_width=4, + logits_soft_cap=30.0, + rms_norm_eps=1e-6, + use_cache=True, + rope_theta=10000.0, + type_vocab_size=16, + type_sequence_label_size=2, + num_labels=3, + num_choices=4, + pad_token_id=0, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + + self.num_hidden_layers = num_hidden_layers + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_attention_heads = num_attention_heads + self.lru_width = lru_width if lru_width is not None else hidden_size + self.embeddings_scale_by_sqrt_dim = embeddings_scale_by_sqrt_dim + self.attention_window_size = attention_window_size + self.conv1d_width = conv1d_width + self.logits_soft_cap = logits_soft_cap + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + + self.type_vocab_size = type_vocab_size + self.type_sequence_label_size = type_sequence_label_size + self.num_labels = num_labels + self.num_choices = num_choices + self.pad_token_id = pad_token_id + self.scope = scope + + # Copied from tests.models.mistral.test_modeling_mistral.MistralModelTester.prepare_config_and_inputs + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones(self.batch_size, self.seq_length)).to(torch_device) + + token_type_ids = None + if self.use_token_type_ids: + token_type_ids = 
ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size) + + sequence_labels = None + token_labels = None + choice_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + choice_labels = ids_tensor([self.batch_size], self.num_choices) + + config = self.get_config() + + return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + + def get_config(self): + return RecurrentGemmaConfig( + num_hidden_layers=self.num_hidden_layers, + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + intermediate_size=self.intermediate_size, + num_attention_heads=self.num_attention_heads, + lru_width=self.lru_width, + embeddings_scale_by_sqrt_dim=self.embeddings_scale_by_sqrt_dim, + attention_window_size=self.attention_window_size, + conv1d_width=self.conv1d_width, + logits_soft_cap=self.logits_soft_cap, + rms_norm_eps=self.rms_norm_eps, + use_cache=self.use_cache, + rope_theta=self.rope_theta, + pad_token_id=self.pad_token_id, + output_attentions=False, + ) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model with Llama->RecurrentGemma + def create_and_check_model( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + model = RecurrentGemmaModel(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask) + result = model(input_ids) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_model_as_decoder with Llama->RecurrentGemma + def create_and_check_model_as_decoder( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + config.add_cross_attention = True + model = RecurrentGemmaModel(config) + model.to(torch_device) + model.eval() + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + ) + result = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + ) + result = model(input_ids, attention_mask=input_mask) + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_for_causal_lm with Llama->RecurrentGemma + def create_and_check_for_causal_lm( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + encoder_attention_mask, + ): + model = RecurrentGemmaForCausalLM(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=token_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.create_and_check_decoder_model_past_large_inputs with Llama->RecurrentGemma + def create_and_check_decoder_model_past_large_inputs( + self, + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + encoder_hidden_states, + 
encoder_attention_mask, + ): + config.is_decoder = True + config.add_cross_attention = True + model = RecurrentGemmaForCausalLM(config=config) + model.to(torch_device) + model.eval() + + # first forward pass + outputs = model( + input_ids, + attention_mask=input_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + use_cache=True, + ) + past_key_values = outputs.past_key_values + + # create hypothetical multiple next tokens and extend to next_input_ids + next_tokens = ids_tensor((self.batch_size, 3), config.vocab_size) + next_mask = ids_tensor((self.batch_size, 3), vocab_size=2) + + # append to next input_ids and next_attention_mask + next_input_ids = torch.cat([input_ids, next_tokens], dim=-1) + next_attention_mask = torch.cat([input_mask, next_mask], dim=-1) + + output_from_no_past = model( + next_input_ids, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_hidden_states=True, + )["hidden_states"][0] + output_from_past = model( + next_tokens, + attention_mask=next_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + output_hidden_states=True, + )["hidden_states"][0] + + # select random slice + random_slice_idx = ids_tensor((1,), output_from_past.shape[-1]).item() + output_from_no_past_slice = output_from_no_past[:, -3:, random_slice_idx].detach() + output_from_past_slice = output_from_past[:, :, random_slice_idx].detach() + + self.parent.assertTrue(output_from_past_slice.shape[1] == next_tokens.shape[1]) + + # test that outputs are equal for slice + self.parent.assertTrue(torch.allclose(output_from_past_slice, output_from_no_past_slice, atol=1e-3)) + + # Copied from tests.models.llama.test_modeling_llama.LlamaModelTester.prepare_config_and_inputs_for_common with Llama->RecurrentGemma + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + token_type_ids, + input_mask, + sequence_labels, + token_labels, + choice_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_torch +class RecurrentGemmaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (RecurrentGemmaForCausalLM,) if is_torch_available() else () + # all_generative_model_classes = (RecurrentGemmaForCausalLM,) if is_torch_available() else () #TODO @gante not fully supported + pipeline_model_mapping = ( + { + "feature-extraction": RecurrentGemmaModel, + "text-generation": RecurrentGemmaForCausalLM, + } + if is_torch_available() + else {} + ) + fx_compatible = False # FIXME let's try to support this @ArthurZucker + test_torchscript = False # FIXME let's try to support this @ArthurZucker + test_missing_keys = False + test_model_parallel = False + test_pruning = False + test_head_masking = False # RecurrentGemma does not have attention heads + + # Need to remove 0.9 in `test_cpu_offload` + # This is because we are hitting edge cases with the causal_mask buffer + model_split_percents = [0.5, 0.6] + + # TODO (ydshieh): Check this.
See https://app.circleci.com/pipelines/github/huggingface/transformers/79245/workflows/9490ef58-79c2-410d-8f51-e3495156cf9c/jobs/1012146 + def is_pipeline_test_to_skip( + self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name + ): + return True + + def setUp(self): + # We don't output attentions + self.has_attentions = False + self.model_tester = RecurrentGemmaModelTester(self) + self.config_tester = ConfigTester(self, config_class=RecurrentGemmaConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_model_various_embeddings(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + for type in ["absolute", "relative_key", "relative_key_query"]: + config_and_inputs[0].position_embedding_type = type + self.model_tester.create_and_check_model(*config_and_inputs) + + @unittest.skip("Recurrent gemma does not use legacy cache") + @parameterized.expand([(1, False), (1, True), (4, False)]) + def test_new_cache_format(self, num_beams, do_sample): + pass + + def test_save_load_fast_init_from_base(self): + pass + + @unittest.skip("RecurrentGemma does not return pkv") + def test_past_key_values_format(self): + pass + + @unittest.skip("RecurrentGemma only supports sdpa") + def test_eager_matches_sdpa_generate(self): + pass + + @unittest.skip("RecurrentGemma only supports sdpa") + def test_eager_matches_sdpa_inference(self): + pass + + @unittest.skip("RecurrentGemma does not return the cache") + def test_contrastive_generate_low_memory(self): + pass + + @unittest.skip("RecurrentGemma does not return the cache") + def test_contrastive_generate_dict_outputs_use_cache(self): + pass + + @unittest.skip("RecurrentGemma does not return the cache") + def test_contrastive_generate(self): + pass + + @unittest.skip("SQRBound is known to have issues with gc") + def test_training_gradient_checkpointing_use_reentrant_false(self): + pass + + def _check_attentions_for_generate(self, *args, **kwargs): + return True # Model does not return attention + + @unittest.skip("Past key values are not returned") + def test_prompt_lookup_decoding_matches_greedy_search(self): + pass + + @unittest.skip("Past key values are not returned") + def test_model_parallelism(self): + pass + + @unittest.skip("Past key values are not returned") + def test_model_parallel_beam_search(self): + pass + + def _check_past_key_values_for_generate(self, *args, **kwargs): + return True + + @unittest.skip("Rely on `past_key_values` to crop the assistant pkv. Not supported") + def test_assisted_decoding_matches_greedy_search(self): + pass + + @unittest.skip("RecurrentGemma's output different if you pad left or right. This is expected") + def test_left_padding_compatibility(self): + pass + + @unittest.skip("Relies on `past_key_values` returned by the model. 
Not supported with recurrent gemma") + def test_assisted_decoding_sample(self): + pass + + def _check_hidden_states_for_generate( + self, batch_size, hidden_states, min_length, max_length, config, use_cache=False, num_beam_groups=1 + ): + self.assertIsInstance(hidden_states, tuple) + self.assertListEqual( + [isinstance(iter_hidden_states, tuple) for iter_hidden_states in hidden_states], + [True] * len(hidden_states), + ) + self.assertEqual(len(hidden_states), (max_length - min_length) * num_beam_groups) + + for idx, iter_hidden_states in enumerate(hidden_states): + seq_len = min_length + idx if not use_cache else 1 + expected_shape = (batch_size * num_beam_groups, seq_len, config.hidden_size) + # check hidden size + self.assertListEqual( + [layer_hidden_states.shape for layer_hidden_states in iter_hidden_states], + [expected_shape] * len(iter_hidden_states), + ) + + @unittest.skip("TODO @arthurzucker not super important and failing.") + def test_initialization(self): + pass + + +@require_torch_gpu +@slow +class RecurrentGemmaIntegrationTest(unittest.TestCase): + input_text = ["Hello I am doing", "Hi today"] + model_id = "google/recurrentgemma-2b" + + @require_read_token + def test_2b_generate(self): + EXPECTED_TEXTS = ['Hello I am doing a project on the topic of "The impact of the internet on the society" and I am looking for some information on the topic. I am looking for some information on the impact of the internet on the society. I am looking for some information on the impact of the internet on the society. I am looking for some', 'Hi today is a very good day for you. You will be able to do all the work you want to do. You will be able to do all the work you want to do. You will be able to do all the work you want to do. You will be able to do all the work you want to do.'] # fmt: skip + model = AutoModelForCausalLM.from_pretrained(self.model_id, low_cpu_mem_usage=True).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id) + tokenizer.padding_side = "right" + + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=64, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + tokenizer.padding_side = "left" + EXPECTED_TEXTS = ['Hello I am doing a project on the topic of "The impact of the internet on the society" and I am looking for some information on the topic. I am looking for some information on the impact of the internet on the society. I am looking for some information on the impact of the internet on the society. I am looking for some', 'Hi today I am going to share with you the best free online video editing software.\n\n
**Best Free Online Video Editing Software**
\n\n1. Wondershare Filmora\n\nWondershare Filmora is a free online video editing software that is used to edit videos.'] # fmt: skip + + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + output = model.generate(**inputs, max_new_tokens=64, do_sample=False) + del model + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16 + ).to(torch_device) + output = model.generate(**inputs, max_new_tokens=64, do_sample=False) + del model + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_2b_sample(self): + set_seed(0) + EXPECTED_TEXT = ['Where is Paris ?\n\nChoose the word or phrase that is closest in meaning to the word in capital letters.\n\nREDEEM\n(A) sort out\n(B) think over\n(C) turn in\n(D) take back\n\nWrite the correct word in the space next to each definition. Use each word only once.\n\nto badly damage\n\nOn the lines provided below, write P if the underlined word group is a phrase and NP if it is not a phrase. Example $\\underline{\\text{P}}$ 1. We have finally discovered the secret $\\underline{\\text{of delicious pizza. }}$'] # fmt: skip + model = AutoModelForCausalLM.from_pretrained(self.model_id).to(torch_device) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id) + inputs = tokenizer("Where is Paris ?", return_tensors="pt", padding=True).to(torch_device) + output = model.generate(**inputs, max_new_tokens=128, do_sample=True) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXT) + + @require_bitsandbytes + @require_read_token + def test_model_2b_8bit(self): + EXPECTED_TEXTS = ['Hello I am doing a project on the topic of "The impact of the internet on the society" and I am looking', "Hi today I'm going to show you how to make a simple and easy to use "] # fmt: skip + + model = AutoModelForCausalLM.from_pretrained( + "gg-hf/recurrent-gemma-2b-hf", device_map={"": torch_device}, load_in_8bit=True, torch_dtype=torch.bfloat16 + ) + + tokenizer = AutoTokenizer.from_pretrained(self.model_id) + inputs = tokenizer(self.input_text, return_tensors="pt", padding=True).to(torch_device) + + output = model.generate(**inputs, max_new_tokens=20, do_sample=False) + output_text = tokenizer.batch_decode(output, skip_special_tokens=True) + + self.assertEqual(output_text, EXPECTED_TEXTS) + + @require_read_token + def test_long_context(self): + input_text = [ + 'Marseille, France (CNN)The French prosecutor leading an investigation into the crash of Germanwings Flight 9525 insisted Wednesday that he was not aware of any video footage from on board the plane. Marseille prosecutor Brice Robin told CNN that "so far no videos were used in the crash investigation." He added, "A person who has such a video needs to immediately give it to the investigators." Robin\'s comments follow claims by two magazines, German daily Bild and French Paris Match, of a cell phone video showing the harrowing final seconds from on board Germanwings Flight 9525 as it crashed into the French Alps. All 150 on board were killed. Paris Match and Bild reported that the video was recovered from a phone at the wreckage site. The two publications described the supposed video, but did not post it on their websites. 
The publications said that they watched the video, which was found by a source close to the investigation. "One can hear cries of \'My God\' in several languages," Paris Match reported. "Metallic banging can also be heard more than three times, perhaps of the pilot trying to open the cockpit door with a heavy object. Towards the end, after a heavy shake, stronger than the others, the screaming intensifies. Then nothing." "It is a very disturbing scene," said Julian Reichelt, editor-in-chief of Bild online. An official with France\'s accident investigation agency, the BEA, said the agency is not aware of any such video. Lt. Col.' + ] + EXPECTED_GENERATION = [ + ' Jean-Paul Delannoy told CNN that the BEA is "not aware of any video footage that could have been taken on board the plane." "We are not aware of any video footage that could have been taken on board the plane," Delannoy said. "We are not aware of any video footage that could' + ] + + model = AutoModelForCausalLM.from_pretrained( + self.model_id, low_cpu_mem_usage=True, torch_dtype=torch.float16 + ).to(torch_device) + tokenizer = AutoTokenizer.from_pretrained(self.model_id) + inputs = tokenizer(input_text, return_tensors="pt").to(torch_device) + output = model.generate(**inputs, max_new_tokens=64, do_sample=False) + output_text = tokenizer.batch_decode(output[:, inputs.input_ids.shape[1] :], skip_special_tokens=True) + self.assertEqual(output_text, EXPECTED_GENERATION) diff --git a/utils/check_config_attributes.py b/utils/check_config_attributes.py index 140cd560e0..736ee681c3 100644 --- a/utils/check_config_attributes.py +++ b/utils/check_config_attributes.py @@ -34,6 +34,8 @@ CONFIG_MAPPING = transformers.models.auto.configuration_auto.CONFIG_MAPPING SPECIAL_CASES_TO_ALLOW = { # used to compute the property `self.chunk_length` "EncodecConfig": ["overlap"], + # used to compute the property `self.layers_block_type` + "RecurrentGemmaConfig": ["block_types"], # used as in the config to define `intermediate_size` "MambaConfig": ["expand"], # used as `self.bert_model = BertModel(config, ...)` diff --git a/utils/check_repo.py b/utils/check_repo.py index f577bf1507..13dcd6ad97 100644 --- a/utils/check_repo.py +++ b/utils/check_repo.py @@ -86,6 +86,7 @@ PRIVATE_MODELS = [ # Being in this list is an exception and should **not** be the rule. IGNORE_NON_TESTED = PRIVATE_MODELS.copy() + [ # models to ignore for not tested + "RecurrentGemmaModel", # Building part of bigger (tested) model. "FuyuForCausalLM", # Not tested fort now "InstructBlipQFormerModel", # Building part of bigger (tested) model. "UMT5EncoderModel", # Building part of bigger (tested) model. 
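A note on the pattern used in `test_long_context` above: `generate` returns the prompt concatenated with the continuation, so the test slices off the prompt length before decoding. A minimal, self-contained sketch of that idiom — the `gpt2` checkpoint is only a stand-in for the gated RecurrentGemma weights:

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

# Placeholder checkpoint; any causal LM works the same way.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

inputs = tokenizer("Marseille, France (CNN)The French prosecutor", return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=16, do_sample=False)

# Keep only the newly generated ids, as in `output[:, inputs.input_ids.shape[1] :]`
# in the test above, so the decoded text contains just the continuation.
new_tokens = output[:, inputs.input_ids.shape[1] :]
print(tokenizer.batch_decode(new_tokens, skip_special_tokens=True))
```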
diff --git a/utils/not_doctested.txt b/utils/not_doctested.txt index b57ed3b791..4ac104ee2e 100644 --- a/utils/not_doctested.txt +++ b/utils/not_doctested.txt @@ -768,6 +768,7 @@ src/transformers/models/rag/modeling_tf_rag.py src/transformers/models/rag/retrieval_rag.py src/transformers/models/realm/modeling_realm.py src/transformers/models/realm/retrieval_realm.py +src/transformers/models/recurrent_gemma/modeling_recurrent_gemma.py src/transformers/models/reformer/convert_reformer_trax_checkpoint_to_pytorch.py src/transformers/models/regnet/configuration_regnet.py src/transformers/models/regnet/convert_regnet_seer_10b_to_pytorch.py From 3280b13260bb0863f7904057e12a292386fe888c Mon Sep 17 00:00:00 2001 From: "Etienne.bfx" Date: Wed, 10 Apr 2024 17:07:58 +0200 Subject: [PATCH 075/110] Fixing a bug when MLflow tries to log a torch.Tensor (#29932) * Update integration_utils.py Add the case where a tensor with one element is logged with MLflow * Update src/transformers/integrations/integration_utils.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update integration_utils.py add a whitespace --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- src/transformers/integrations/integration_utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/transformers/integrations/integration_utils.py b/src/transformers/integrations/integration_utils.py index 330fccb20d..00074a9574 100644 --- a/src/transformers/integrations/integration_utils.py +++ b/src/transformers/integrations/integration_utils.py @@ -1071,6 +1071,8 @@ class MLflowCallback(TrainerCallback): for k, v in logs.items(): if isinstance(v, (int, float)): metrics[k] = v + elif isinstance(v, torch.Tensor) and v.numel() == 1: + metrics[k] = v.item() else: logger.warning( f'Trainer is attempting to log a value of "{v}" of type {type(v)} for key "{k}" as a metric. '
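For context on the MLflow patch above: `mlflow.log_metric` only accepts plain numbers, so before this change a scalar living in a `torch.Tensor` was warned about and dropped. A standalone sketch of the new branch (the helper name is ours, not part of the patch):

```python
import torch

def as_mlflow_metric(v):
    # Mirrors the patched loop: numbers pass through, one-element tensors
    # are unwrapped with .item(), anything else is left for the warning path.
    if isinstance(v, (int, float)):
        return v
    if isinstance(v, torch.Tensor) and v.numel() == 1:
        return v.item()
    return None

print(as_mlflow_metric(torch.tensor(0.5)))         # 0.5 -- now logged instead of dropped
print(as_mlflow_metric(torch.tensor([1.0, 2.0])))  # None -- multi-element tensors still skipped
```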
From 4f7a9f9c5c6940015a5fbf2f23f85e2e6cae0891 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Wed, 10 Apr 2024 17:45:49 +0200 Subject: [PATCH 076/110] Fix natten install in docker (#30161) * fix dinat in docker * update --------- Co-authored-by: ydshieh --- docker/transformers-all-latest-gpu/Dockerfile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docker/transformers-all-latest-gpu/Dockerfile b/docker/transformers-all-latest-gpu/Dockerfile index 6f122ed160..682ec1355e 100644 --- a/docker/transformers-all-latest-gpu/Dockerfile +++ b/docker/transformers-all-latest-gpu/Dockerfile @@ -46,7 +46,8 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/opt RUN python3 -m pip install --no-cache-dir decord av==9.2.0 # For `dinat` model -RUN python3 -m pip install --no-cache-dir 'natten<0.15.0' -f https://shi-labs.com/natten/wheels/$CUDA/ +# The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent) +RUN python3 -m pip install --no-cache-dir natten==0.15.1+torch220$CUDA -f https://shi-labs.com/natten/wheels # For `nougat` tokenizer RUN python3 -m pip install --no-cache-dir python-Levenshtein From f569172fc283fb5139751a0fefa81873faf87fa5 Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Wed, 10 Apr 2024 18:12:43 +0200 Subject: [PATCH 077/110] FIX / bnb: fix torch compatibility issue with `itemsize` (#30162) * fix torch compatibility issues * fix * Update src/transformers/modeling_utils.py --- src/transformers/modeling_utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 9f22333839..8a107694a1 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -1159,9 +1159,12 @@ class ModuleUtilsMixin: # For 4bit models, we need to multiply the number of parameters by 2 as half of the parameters are # used for the 4bit quantization (uint8 tensors are stored) if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit): - total_numel.append( - param.numel() * 2 * self.hf_quantizer.quantization_config.bnb_4bit_quant_storage.itemsize + quant_storage = self.hf_quantizer.quantization_config.bnb_4bit_quant_storage + # For compatibility with older PT version - see: https://github.com/huggingface/peft/pull/1635 + nb_params = ( + quant_storage.itemsize if hasattr(quant_storage, "itemsize") else quant_storage.element_size() ) + total_numel.append(param.numel() * 2 * nb_params) else: total_numel.append(param.numel())
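The `hasattr` guard in the bitsandbytes patch above exists because `torch.dtype.itemsize` is only available on recent PyTorch releases. A rough illustration of the fallback idea, recovering the byte width from a throwaway tensor on older versions (the helper is a sketch, not the library's API):

```python
import torch

def storage_itemsize(dtype: torch.dtype) -> int:
    # Recent torch exposes the byte width directly on the dtype.
    if hasattr(dtype, "itemsize"):
        return dtype.itemsize
    # Older versions can recover it from any tensor of that dtype.
    return torch.empty((), dtype=dtype).element_size()

print(storage_itemsize(torch.uint8))    # 1
print(storage_itemsize(torch.float16))  # 2
```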
""" - if hasattr(model_class, "config_class") and model_class.config_class != config_class: + if hasattr(model_class, "config_class") and str(model_class.config_class) != str(config_class): raise ValueError( "The model class you are passing has a `config_class` attribute that is not consistent with the " f"config class you passed (model has {model_class.config_class} and you passed {config_class}. Fix " From a5e5c92aea1e99cb84d7342bd63826ca6cd884c4 Mon Sep 17 00:00:00 2001 From: DamonGuzman <106456287+DamonGuzman@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:52:51 -0700 Subject: [PATCH 079/110] Fixed typo in comments/documentation for Pipelines documentation (#30170) Update feature_extraction.py - Fixed typo in comments/documentation --- src/transformers/pipelines/feature_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/feature_extraction.py b/src/transformers/pipelines/feature_extraction.py index e8adb11b68..7d67a615ac 100644 --- a/src/transformers/pipelines/feature_extraction.py +++ b/src/transformers/pipelines/feature_extraction.py @@ -24,7 +24,7 @@ class FeatureExtractionPipeline(Pipeline): >>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction") >>> result = extractor("This is a simple test.", return_tensors=True) - >>> result.shape # This is a tensor of shape [1, sequence_lenth, hidden_dimension] representing the input string. + >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input string. torch.Size([1, 8, 768]) ``` From b752ad3019afbea95605b743a5c278dfccbe701e Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 11 Apr 2024 09:32:16 +0200 Subject: [PATCH 080/110] Adding grounding dino (#26087) * Fixed typo when converting weigths to GroundingDINO vision backbone * Final modifications on modeling * Removed unnecessary class * Fixed convert structure * Added image processing * make fixup partially completed * Now text_backbone_config has its own class * Modified convert script * Removed unnecessary config attribute * Added new function to generate sub sentence mask * Renamed parameters with gamma in the name as it's currently not allowed * Removed tokenization and image_processing scripts since we'll map from existing models * Fixed some issues with configuration * Just some modifications on conversion script * Other modifications * Copied deformable detr * First commit * Added bert to model * Bert validated * Created Text and Fusion layers for Encoder * Adapted Encoder layer * Fixed typos * Adjusted Encoder * Converted encoder to hf * Modified Decoder Layer * Modified main decoder class * Removed copy comments * Fixed forward from GroundingDINOModel and GroundingDINODecoder * Added all necessary layers, configurations and forward logic up to GroundingDINOModel * Added all layers to convertion * Fixed outputs for GroundingDINOModel and GroundingDINOForObjectDetection * Fixed mask input to encoders and fixed nn.MultiheadAttention batch first and attn output * Fixed forward from GroundingDINOTextEnhancerLayer * Fixed output bug with GroundingDINODeformableLayer * Fixed bugs that prevent GroundingDINOForObjectDetection to run forward method * Fixed attentions to be passed correctly * Passing temperature arg when creating Sine position embedding * Removed copy comments * Added temperature argument for position embedding * Fixed typo when converting weigths to GroundingDINO vision 
From a5e5c92aea1e99cb84d7342bd63826ca6cd884c4 Mon Sep 17 00:00:00 2001 From: DamonGuzman <106456287+DamonGuzman@users.noreply.github.com> Date: Wed, 10 Apr 2024 14:52:51 -0700 Subject: [PATCH 079/110] Fixed typo in comments/documentation for Pipelines documentation (#30170) Update feature_extraction.py - Fixed typo in comments/documentation --- src/transformers/pipelines/feature_extraction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/pipelines/feature_extraction.py b/src/transformers/pipelines/feature_extraction.py index e8adb11b68..7d67a615ac 100644 --- a/src/transformers/pipelines/feature_extraction.py +++ b/src/transformers/pipelines/feature_extraction.py @@ -24,7 +24,7 @@ class FeatureExtractionPipeline(Pipeline): >>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction") >>> result = extractor("This is a simple test.", return_tensors=True) - >>> result.shape # This is a tensor of shape [1, sequence_lenth, hidden_dimension] representing the input string. + >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input string. torch.Size([1, 8, 768]) ``` From b752ad3019afbea95605b743a5c278dfccbe701e Mon Sep 17 00:00:00 2001 From: Eduardo Pacheco <69953243+EduardoPach@users.noreply.github.com> Date: Thu, 11 Apr 2024 09:32:16 +0200 Subject: [PATCH 080/110] Adding grounding dino (#26087) * Fixed typo when converting weights to GroundingDINO vision backbone * Final modifications on modeling * Removed unnecessary class * Fixed convert structure * Added image processing * make fixup partially completed * Now text_backbone_config has its own class * Modified convert script * Removed unnecessary config attribute * Added new function to generate sub sentence mask * Renamed parameters with gamma in the name as it's currently not allowed * Removed tokenization and image_processing scripts since we'll map from existing models * Fixed some issues with configuration * Just some modifications on conversion script * Other modifications * Copied deformable detr * First commit * Added bert to model * Bert validated * Created Text and Fusion layers for Encoder * Adapted Encoder layer * Fixed typos * Adjusted Encoder * Converted encoder to hf * Modified Decoder Layer * Modified main decoder class * Removed copy comments * Fixed forward from GroundingDINOModel and GroundingDINODecoder * Added all necessary layers, configurations and forward logic up to GroundingDINOModel * Added all layers to conversion * Fixed outputs for GroundingDINOModel and GroundingDINOForObjectDetection * Fixed mask input to encoders and fixed nn.MultiheadAttention batch first and attn output * Fixed forward from GroundingDINOTextEnhancerLayer * Fixed output bug with GroundingDINODeformableLayer * Fixed bugs that prevent GroundingDINOForObjectDetection to run forward method * Fixed attentions to be passed correctly * Passing temperature arg when creating Sine position embedding * Removed copy comments * Added temperature argument for position embedding * Fixed typo when converting weights to GroundingDINO vision backbone * Final modifications on modeling * Removed unnecessary class * Fixed convert structure * Added image processing * make fixup partially completed * Now text_backbone_config has its own class * Modified convert script * Removed unnecessary config attribute * Added new function to generate sub sentence mask * Renamed parameters with gamma in the name as it's currently not allowed * Removed tokenization and image_processing scripts since we'll map from existing models * Fixed some issues with configuration * Just some modifications on conversion script * Other modifications * Fix style * Improve fixup * Improve conversion script * Improve conversion script * Add GroundingDINOProcessor * More improvements * Return token type ids * something * Fix more tests * More improvements * More cleanup * More improvements * Fixed tests, improved modeling and config * More improvements and fixing tests * Improved tests and modeling * Improved tests and added image processor * Improved tests inference * More improvements * More test improvements * Fixed last test * Improved docstrings and comments * Fix style * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Better naming * Better naming * Added Copied statement * Added Copied statement * Moved param init from GroundingDINOBiMultiHeadAttention * Better naming * Fixing clamp style * Better naming * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> * Update src/transformers/models/grounding_dino/configuration_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Update src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Improving conversion script * Improved config * Improved naming * Improved naming again * Improved grounding-dino.md * Moved grounding dino to multimodal * Update src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> * Fixed docstrings and style * Fix docstrings * Remove timm attributes * Reorder imports * More improvements * Add Grounding DINO to pipeline * Remove model from check_repo * Added grounded post_process to GroundingDINOProcessor * Fixed style * Fixed GroundingDINOTextPrenetConfig docstrings * Aligned inputs.keys() when both image and text are passed with
model_input_names * Added tests for GroundingDINOImageProcessor and GroundingDINOProcessor * Testing post_process_grounded_object_detection from GroundingDINOProcessor at test_inference_object_detection_head * Fixed order * Marked test with require_torch * Temporarily changed repo_id * More improvements * Fix style * Final improvements * Improve annotators * Fix style * Add is_torch_available * Remove type hints * vocab_tokens as one liner * Removed print statements * Renamed GroundingDINOTextPrenetConfig to GroundingDINOTextConfig * remove unnecessary comments * Removed unnecessary tests on conversion script * Renamed GroundingDINO to camel case GroundingDino * Fixed GroundingDinoProcessor docstrings * loading MSDA kernels in the modeling file * Fix copies * Replace nn.multiheadattention * Replace nn.multiheadattention * Fixed inputs for GroundingDinoMultiheadAttention & order of modules * Fixed processing to avoid messing with inputs * Added more tips for GroundingDino * Make style * Changing name to align with SAM * Replace final nn.multiheadattention * Fix model tests * Update year, remove GenerationTesterMixin * Address comments * Address more comments * Rename TextPrenet to TextModel * Rename hidden_states * Address more comments * Address more comments * Address comment * Address more comments * Address merge * Address comment * Address comment * Address comment * Make style * Added layer norm eps to layer norms * Address more comments * More fixes * Fixed equivalence * Make fixup * Remove print statements * Address comments * Address comments * Address comments * Address comments * Address comments * Address comments * Add comment * Address comment * Remove overwriting of test * Fix bbox_embed * Improve decoder_bbox_embed_share * Simplify outputs * Updated post_process_grounded_object_detection * Renamed sources to feature_maps * Improved tests for Grounding Dino ImageProcessor and Processor * Fixed test requirements and imports * Fixed image_processing * Fixed processor tests * Fixed imports for image processing tests * Fix copies * Updated modeling * Fix style * Moved functions to correct position * Fixed copy issues * Update src/transformers/models/deformable_detr/modeling_deformable_detr.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> * Keeping consistency custom cuda kernels for MSDA * Make GroundingDinoProcessor logic clearer * Updated Grounding DINO checkpoints * Changed tests to correct structure * Updated gpu-cpu equivalence test * fix copies * Update src/transformers/models/grounding_dino/processing_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/processing_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/modeling_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/grounding_dino/configuration_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Fixed errors and style * Fix copies * Removed inheritance from PreTrainedModel
from GroundingDinoTextModel * Fixed GroundingDinoTextModel * Fixed type of default backbone config * Fixed missing methods for GroundingDinoTextModel and Added timm support for GroundingDinoConvEncoder * Addressed comments * Addressed batched image processing tests * Addressed zero shot test comment * Addressed tip comment * Removed GroundingDinoTextModel from check_repo * Removed inplace masking * Addressed comments * Addressed comments * Addressed comments * Fix copies * Fixing timm test * Fixed batching equivalence test * Update docs/source/en/model_doc/grounding-dino.md Co-authored-by: Tianqi Xu <40522713+dandansamax@users.noreply.github.com> * Update docs/source/en/model_doc/grounding-dino.md Co-authored-by: Tianqi Xu <40522713+dandansamax@users.noreply.github.com> * Update docs/source/en/model_doc/grounding-dino.md Co-authored-by: Tianqi Xu <40522713+dandansamax@users.noreply.github.com> * Addressed more comments * Added a new comment * Reduced image size * Addressed more comments * Nits * Nits * Changed the way text_config is initialized * Update src/transformers/models/grounding_dino/processing_grounding_dino.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --------- Co-authored-by: Niels Co-authored-by: Rafael Padilla <31217453+rafaelpadilla@users.noreply.github.com> Co-authored-by: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Co-authored-by: Eduardo Pacheco Co-authored-by: Sangbum Daniel Choi <34004152+SangbumChoi@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Co-authored-by: Tianqi Xu <40522713+dandansamax@users.noreply.github.com> --- README.md | 1 + README_de.md | 1 + README_es.md | 1 + README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 1 + README_ru.md | 1 + README_te.md | 1 + README_vi.md | 1 + README_zh-hans.md | 1 + README_zh-hant.md | 1 + docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/grounding-dino.md | 97 + src/transformers/__init__.py | 32 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 2 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 2 + .../models/auto/tokenization_auto.py | 1 + .../modeling_deformable_detr.py | 27 +- src/transformers/models/deta/modeling_deta.py | 5 +- .../models/grounding_dino/__init__.py | 81 + .../configuration_grounding_dino.py | 301 ++ .../convert_grounding_dino_to_hf.py | 491 +++ .../image_processing_grounding_dino.py | 1511 ++++++++ .../grounding_dino/modeling_grounding_dino.py | 3132 +++++++++++++++++ .../processing_grounding_dino.py | 228 ++ src/transformers/utils/dummy_pt_objects.py | 24 + .../utils/dummy_vision_objects.py | 7 + tests/models/grounding_dino/__init__.py | 0 .../test_image_processing_grounding_dino.py | 530 +++ .../test_modeling_grounding_dino.py | 689 ++++ .../test_processor_grounding_dino.py | 253 ++ 36 files changed, 7411 insertions(+), 20 deletions(-) create mode 100644 docs/source/en/model_doc/grounding-dino.md create mode 100644 src/transformers/models/grounding_dino/__init__.py create mode 100644 src/transformers/models/grounding_dino/configuration_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py create mode 100644 src/transformers/models/grounding_dino/image_processing_grounding_dino.py create mode 100644 src/transformers/models/grounding_dino/modeling_grounding_dino.py create mode 100644 
src/transformers/models/grounding_dino/processing_grounding_dino.py create mode 100644 tests/models/grounding_dino/__init__.py create mode 100644 tests/models/grounding_dino/test_image_processing_grounding_dino.py create mode 100644 tests/models/grounding_dino/test_modeling_grounding_dino.py create mode 100644 tests/models/grounding_dino/test_processor_grounding_dino.py diff --git a/README.md b/README.md index bb06321f12..de844848a4 100644 --- a/README.md +++ b/README.md @@ -389,6 +389,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 
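The README entry above only links the paper; for orientation, here is a hedged sketch of the zero-shot detection flow the Grounding DINO commit enables. The `IDEA-Research/grounding-dino-tiny` checkpoint name and the threshold values are assumptions for illustration, not taken from this patch:

```python
import requests
import torch
from PIL import Image
from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor

model_id = "IDEA-Research/grounding-dino-tiny"  # assumed public checkpoint
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
text = "a cat. a remote control."  # text queries are lowercase and dot-separated

inputs = processor(images=image, text=text, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# `post_process_grounded_object_detection` is the processor method the commit
# message mentions; threshold values here are illustrative.
results = processor.post_process_grounded_object_detection(
    outputs, inputs.input_ids, box_threshold=0.4, text_threshold=0.3, target_sizes=[image.size[::-1]]
)
print(results[0]["labels"], results[0]["boxes"].shape)
```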
diff --git a/README_de.md b/README_de.md index 3eb2c63b03..e5bd3522ca 100644 --- a/README_de.md +++ b/README_de.md @@ -385,6 +385,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_es.md b/README_es.md index 202411bbf8..5f6a2afb7c 100644 --- a/README_es.md +++ b/README_es.md @@ -362,6 +362,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_fr.md b/README_fr.md index 7d6d2e765a..9c6f71d324 100644 --- a/README_fr.md +++ b/README_fr.md @@ -383,6 +383,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (de BigCode) a été publié dans l'article [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) par Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** a été publié dans le dépôt [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) par Toshiyuki Sakamoto (tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (de Microsoft) a été publié dans l'article [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) par Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (de Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) publié dans l'article [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) par Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (de l'UCSD, NVIDIA) a été publié dans l'article [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) par Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (d'Allegro.pl, AGH University of Science and Technology) a été publié dans l'article [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) par Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_hd.md b/README_hd.md index 33ae62baf0..19dfa18b26 100644 --- a/README_hd.md +++ b/README_hd.md @@ -336,6 +336,7 @@ conda install conda-forge::transformers 1.
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode से) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. द्वाराअनुसंधान पत्र [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) के साथ जारी किया गया 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others से) Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. द्वाराअनुसंधान पत्र [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) के साथ जारी किया गया 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA से) साथ में कागज [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) जियारुई जू, शालिनी डी मेलो, सिफ़ी लियू, वोनमिन बायन, थॉमस ब्रेउएल, जान कौट्ज़, ज़ियाओलोंग वांग द्वारा। 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology से) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. द्वाराअनुसंधान पत्र [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) के साथ जारी किया गया 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा। diff --git a/README_ja.md b/README_ja.md index dbe2cecdbd..443f650cae 100644 --- a/README_ja.md +++ b/README_ja.md @@ -396,6 +396,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode から) Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. から公開された研究論文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) 坂本俊之(tanreinama)からリリースされました. 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (Microsoft から) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu から公開された研究論文: [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234). +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others から) Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. から公開された研究論文 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA から) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang から公開された研究論文: [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology から) Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. から公開された研究論文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) diff --git a/README_ko.md b/README_ko.md index 547572c4f2..dfb271b737 100644 --- a/README_ko.md +++ b/README_ko.md @@ -311,6 +311,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (BigCode 에서 제공)은 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.의 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988)논문과 함께 발표했습니다. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu 의 [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) 논문과 함께 발표했습니다. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others 에서 제공)은 Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang.의 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499)논문과 함께 발표했습니다. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (UCSD, NVIDIA 에서) Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 의 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 논문과 함께 발표했습니다. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (Allegro.pl, AGH University of Science and Technology 에서 제공)은 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.의 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf)논문과 함께 발표했습니다. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다. diff --git a/README_pt-br.md b/README_pt-br.md index ad3e6bb90d..8dcdddac00 100644 --- a/README_pt-br.md +++ b/README_pt-br.md @@ -394,6 +394,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_ru.md b/README_ru.md index 7c5cb5f179..7ee48cbcaf 100644 --- a/README_ru.md +++ b/README_ru.md @@ -384,6 +384,7 @@ conda install conda-forge::transformers 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_te.md b/README_te.md index 90a443e434..687a97d697 100644 --- a/README_te.md +++ b/README_te.md @@ -386,6 +386,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_vi.md b/README_vi.md index 98114bf0a8..1a872b9ce5 100644 --- a/README_vi.md +++ b/README_vi.md @@ -385,6 +385,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (từ BigCode) được phát hành với bài báo [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (từ Microsoft) được phát hành với bài báo [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (từ Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) được phát hành với bài báo [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (từ UCSD, NVIDIA) được phát hành với bài báo [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (từ Allegro.pl, AGH University of Science and Technology) được phát hành với bài báo [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (từ Facebook) được phát hành với bài báo [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/README_zh-hans.md b/README_zh-hans.md index bd7457b25f..7e307c781a 100644 --- a/README_zh-hans.md +++ b/README_zh-hans.md @@ -335,6 +335,7 @@ conda install conda-forge::transformers 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (来自 BigCode) 伴随论文 [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) 由 Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra 发布。 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (来自 Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) 伴随论文 [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) 由 Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang 发布。 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (来自 UCSD, NVIDIA) 伴随论文 [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) 由 Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang 发布。 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (来自 Allegro.pl, AGH University of Science and Technology) 伴随论文 [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) 由 Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik 发布。 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。 diff --git a/README_zh-hant.md b/README_zh-hant.md index 1eb46fe700..1431293df5 100644 --- a/README_zh-hant.md +++ b/README_zh-hant.md @@ -347,6 +347,7 @@ conda install conda-forge::transformers 1. 
**[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra. 1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by 坂本俊之(tanreinama). 1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu. +1. **[Grounding DINO](https://huggingface.co/docs/transformers/main/model_doc/grounding-dino)** (from Institute for AI, Tsinghua-Bosch Joint Center for ML, Tsinghua University, IDEA Research and others) released with the paper [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. 1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang. 1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik. 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml index 7daf91c99d..89529a375d 100644 --- a/docs/source/en/_toctree.yml +++ b/docs/source/en/_toctree.yml @@ -732,6 +732,8 @@ title: FLAVA - local: model_doc/git title: GIT + - local: model_doc/grounding-dino + title: Grounding DINO - local: model_doc/groupvit title: GroupViT - local: model_doc/idefics diff --git a/docs/source/en/index.md b/docs/source/en/index.md index 9c5c87d00f..701f0a7e6b 100644 --- a/docs/source/en/index.md +++ b/docs/source/en/index.md @@ -154,6 +154,7 @@ Flax), PyTorch, and/or TensorFlow. 
| [GPTBigCode](model_doc/gpt_bigcode) | ✅ | ❌ | ❌ |
| [GPTSAN-japanese](model_doc/gptsan-japanese) | ✅ | ❌ | ❌ |
| [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ |
+| [Grounding DINO](model_doc/grounding-dino) | ✅ | ❌ | ❌ |
| [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ |
| [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ |
| [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ |
diff --git a/docs/source/en/model_doc/grounding-dino.md b/docs/source/en/model_doc/grounding-dino.md
new file mode 100644
index 0000000000..3c6bd6fce0
--- /dev/null
+++ b/docs/source/en/model_doc/grounding-dino.md
@@ -0,0 +1,97 @@
+
+# Grounding DINO
+
+## Overview
+
+The Grounding DINO model was proposed in [Grounding DINO: Marrying DINO with Grounded Pre-Training for Open-Set Object Detection](https://arxiv.org/abs/2303.05499) by Shilong Liu, Zhaoyang Zeng, Tianhe Ren, Feng Li, Hao Zhang, Jie Yang, Chunyuan Li, Jianwei Yang, Hang Su, Jun Zhu, Lei Zhang. Grounding DINO extends a closed-set object detection model with a text encoder, enabling open-set object detection. The model achieves remarkable results, such as 52.5 AP on COCO zero-shot.
+
+The abstract from the paper is the following:
+
+*In this paper, we present an open-set object detector, called Grounding DINO, by marrying Transformer-based detector DINO with grounded pre-training, which can detect arbitrary objects with human inputs such as category names or referring expressions. The key solution of open-set object detection is introducing language to a closed-set detector for open-set concept generalization. To effectively fuse language and vision modalities, we conceptually divide a closed-set detector into three phases and propose a tight fusion solution, which includes a feature enhancer, a language-guided query selection, and a cross-modality decoder for cross-modality fusion. While previous works mainly evaluate open-set object detection on novel categories, we propose to also perform evaluations on referring expression comprehension for objects specified with attributes. Grounding DINO performs remarkably well on all three settings, including benchmarks on COCO, LVIS, ODinW, and RefCOCO/+/g. Grounding DINO achieves a 52.5 AP on the COCO detection zero-shot transfer benchmark, i.e., without any training data from COCO. It sets a new record on the ODinW zero-shot benchmark with a mean 26.1 AP.*
+
+Grounding DINO overview. Taken from the original paper.
+
+This model was contributed by [EduardoPacheco](https://huggingface.co/EduardoPacheco) and [nielsr](https://huggingface.co/nielsr).
+The original code can be found [here](https://github.com/IDEA-Research/GroundingDINO).
+
+## Usage tips
+
+- One can use [`GroundingDinoProcessor`] to prepare image-text pairs for the model.
+- To separate classes in the text, use a period, e.g. `"a cat. a dog."`
+- When using multiple classes (e.g. `"a cat. a dog."`), post-process the outputs with `post_process_grounded_object_detection` from [`GroundingDinoProcessor`] rather than with `post_process_object_detection`, since the labels returned by the latter are only the indices of the logits whose probability exceeds the threshold. A sketch of consuming the grounded outputs follows this list.
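The following snippet is an editorial sketch, not part of the patch itself: it illustrates one way the `results` returned by `post_process_grounded_object_detection` might be consumed. It assumes each entry of `results` (one per input image) is a dict carrying `"scores"`, `"labels"`, and `"boxes"` keys; those key names are an assumption here, not confirmed by the patch text.

```python
# Sketch under stated assumptions: `results` comes from
# GroundingDinoProcessor.post_process_grounded_object_detection, as in the
# example that follows; one dict per image with "scores" (tensor),
# "labels" (matched text phrases) and "boxes" (xyxy tensors).
def print_detections(results):
    for image_result in results:  # one dict per input image
        for score, label, box in zip(
            image_result["scores"], image_result["labels"], image_result["boxes"]
        ):
            rounded_box = [round(coord, 2) for coord in box.tolist()]
            print(f"Detected {label} with confidence {round(score.item(), 3)} at {rounded_box}")
```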
+
+Here's how to use the model for zero-shot object detection:
+
+```python
+import requests
+
+import torch
+from PIL import Image
+from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+
+model_id = "IDEA-Research/grounding-dino-tiny"
+# run on GPU when available
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+processor = AutoProcessor.from_pretrained(model_id)
+model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+
+image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+image = Image.open(requests.get(image_url, stream=True).raw)
+# Check for cats and remote controls
+text = "a cat. a remote control."
+
+inputs = processor(images=image, text=text, return_tensors="pt").to(device)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+results = processor.post_process_grounded_object_detection(
+    outputs,
+    inputs.input_ids,
+    box_threshold=0.4,
+    text_threshold=0.3,
+    target_sizes=[image.size[::-1]]
+)
+```
+
+
+## GroundingDinoImageProcessor
+
+[[autodoc]] GroundingDinoImageProcessor
+    - preprocess
+    - post_process_object_detection
+
+## GroundingDinoProcessor
+
+[[autodoc]] GroundingDinoProcessor
+    - post_process_grounded_object_detection
+
+## GroundingDinoConfig
+
+[[autodoc]] GroundingDinoConfig
+
+## GroundingDinoModel
+
+[[autodoc]] GroundingDinoModel
+    - forward
+
+## GroundingDinoForObjectDetection
+
+[[autodoc]] GroundingDinoForObjectDetection
+    - forward
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 06a6a0859b..666c5f980a 100644
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -488,9 +488,11 @@ _import_structure = {
         "GPTSanJapaneseConfig",
         "GPTSanJapaneseTokenizer",
     ],
-    "models.graphormer": [
-        "GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP",
-        "GraphormerConfig",
+    "models.graphormer": ["GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "GraphormerConfig"],
+    "models.grounding_dino": [
+        "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP",
+        "GroundingDinoConfig",
+        "GroundingDinoProcessor",
+    ],
    "models.groupvit": [
        "GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP",
@@ -1331,6 +1333,7 @@ else:
     _import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
     _import_structure["models.fuyu"].extend(["FuyuImageProcessor", "FuyuProcessor"])
     _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
+    _import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"])
     _import_structure["models.idefics"].extend(["IdeficsImageProcessor"])
     _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
     _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"])
@@ -2391,6 +2394,14 @@ else:
             "GraphormerPreTrainedModel",
         ]
     )
+    _import_structure["models.grounding_dino"].extend(
+        [
+            "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "GroundingDinoForObjectDetection",
+            "GroundingDinoModel",
+            "GroundingDinoPreTrainedModel",
+        ]
+    )
     _import_structure["models.groupvit"].extend(
         [
             "GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -5380,9 +5391,11 @@ if TYPE_CHECKING:
         GPTSanJapaneseConfig,
         GPTSanJapaneseTokenizer,
     )
-    from .models.graphormer import (
-        GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        GraphormerConfig,
+    from .models.graphormer import GRAPHORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, GraphormerConfig
+    from .models.grounding_dino import (
+        GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GroundingDinoConfig,
+        GroundingDinoProcessor,
+    )
     from .models.groupvit import (
         GROUPVIT_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -6195,6 +6208,7 @@ if TYPE_CHECKING: ) from .models.fuyu import FuyuImageProcessor, FuyuProcessor from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor + from .models.grounding_dino import GroundingDinoImageProcessor from .models.idefics import IdeficsImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor from .models.layoutlmv2 import ( @@ -7112,6 +7126,12 @@ if TYPE_CHECKING: GraphormerModel, GraphormerPreTrainedModel, ) + from .models.grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDinoForObjectDetection, + GroundingDinoModel, + GroundingDinoPreTrainedModel, + ) from .models.groupvit import ( GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST, GroupViTModel, diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index 4a5cd01add..f6db5e164b 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -105,6 +105,7 @@ from . import ( gptj, gptsan_japanese, graphormer, + grounding_dino, groupvit, herbert, hubert, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index 0d5d9ae62b..e5dd68c189 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -120,6 +120,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("gptj", "GPTJConfig"), ("gptsan-japanese", "GPTSanJapaneseConfig"), ("graphormer", "GraphormerConfig"), + ("grounding-dino", "GroundingDinoConfig"), ("groupvit", "GroupViTConfig"), ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), @@ -383,6 +384,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("gptj", "GPT-J"), ("gptsan-japanese", "GPTSAN-japanese"), ("graphormer", "Graphormer"), + ("grounding-dino", "Grounding DINO"), ("groupvit", "GroupViT"), ("herbert", "HerBERT"), ("hubert", "Hubert"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 3debf97fea..6ae28bfa32 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -68,6 +68,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("fuyu", "FuyuImageProcessor"), ("git", "CLIPImageProcessor"), ("glpn", "GLPNImageProcessor"), + ("grounding-dino", "GroundingDinoImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 6f3d9d17a3..cf45f4a8d3 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -115,6 +115,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("gptj", "GPTJModel"), ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), ("graphormer", "GraphormerModel"), + ("grounding-dino", "GroundingDinoModel"), ("groupvit", "GroupViTModel"), ("hubert", "HubertModel"), ("ibert", "IBertModel"), @@ -753,6 +754,7 @@ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( [ # Model for Zero Shot Object Detection mapping + ("grounding-dino", "GroundingDinoForObjectDetection"), ("owlv2", "Owlv2ForObjectDetection"), ("owlvit", "OwlViTForObjectDetection"), ] diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index af30469f9c..d23ce040a9 100644 --- 
a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -195,6 +195,7 @@ else: ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)), + ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), ("hubert", ("Wav2Vec2CTCTokenizer", None)), diff --git a/src/transformers/models/deformable_detr/modeling_deformable_detr.py b/src/transformers/models/deformable_detr/modeling_deformable_detr.py index 1e2296d177..c0ac7cffc7 100755 --- a/src/transformers/models/deformable_detr/modeling_deformable_detr.py +++ b/src/transformers/models/deformable_detr/modeling_deformable_detr.py @@ -710,13 +710,14 @@ class DeformableDetrMultiscaleDeformableAttention(nn.Module): batch_size, num_queries, self.n_heads, self.n_levels, self.n_points ) # batch_size, num_queries, n_heads, n_levels, n_points, 2 - if reference_points.shape[-1] == 2: + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) sampling_locations = ( reference_points[:, :, None, :, None, :] + sampling_offsets / offset_normalizer[None, None, None, :, None, :] ) - elif reference_points.shape[-1] == 4: + elif num_coordinates == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 @@ -1401,14 +1402,15 @@ class DeformableDetrDecoder(DeformableDetrPreTrainedModel): intermediate_reference_points = () for idx, decoder_layer in enumerate(self.layers): - if reference_points.shape[-1] == 4: + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: reference_points_input = ( reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None] ) - else: - if reference_points.shape[-1] != 2: - raise ValueError("Reference points' last dimension must be of size 2") + elif reference_points.shape[-1] == 2: reference_points_input = reference_points[:, :, None] * valid_ratios[:, None] + else: + raise ValueError("Reference points' last dimension must be of size 2") if output_hidden_states: all_hidden_states += (hidden_states,) @@ -1442,17 +1444,18 @@ class DeformableDetrDecoder(DeformableDetrPreTrainedModel): # hack implementation for iterative bounding box refinement if self.bbox_embed is not None: tmp = self.bbox_embed[idx](hidden_states) - if reference_points.shape[-1] == 4: + num_coordinates = reference_points.shape[-1] + if num_coordinates == 4: new_reference_points = tmp + inverse_sigmoid(reference_points) new_reference_points = new_reference_points.sigmoid() - else: - if reference_points.shape[-1] != 2: - raise ValueError( - f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}" - ) + elif num_coordinates == 2: new_reference_points = tmp new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points) new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) reference_points = new_reference_points.detach() intermediate += (hidden_states,) diff 
--git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index 35d9b67d2f..e849135559 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -682,13 +682,14 @@ class DetaMultiscaleDeformableAttention(nn.Module): batch_size, num_queries, self.n_heads, self.n_levels, self.n_points ) # batch_size, num_queries, n_heads, n_levels, n_points, 2 - if reference_points.shape[-1] == 2: + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) sampling_locations = ( reference_points[:, :, None, :, None, :] + sampling_offsets / offset_normalizer[None, None, None, :, None, :] ) - elif reference_points.shape[-1] == 4: + elif num_coordinates == 4: sampling_locations = ( reference_points[:, :, None, :, None, :2] + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 diff --git a/src/transformers/models/grounding_dino/__init__.py b/src/transformers/models/grounding_dino/__init__.py new file mode 100644 index 0000000000..3b0f792068 --- /dev/null +++ b/src/transformers/models/grounding_dino/__init__.py @@ -0,0 +1,81 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = { + "configuration_grounding_dino": [ + "GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP", + "GroundingDinoConfig", + ], + "processing_grounding_dino": ["GroundingDinoProcessor"], +} + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_grounding_dino"] = [ + "GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST", + "GroundingDinoForObjectDetection", + "GroundingDinoModel", + "GroundingDinoPreTrainedModel", + ] + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_grounding_dino"] = ["GroundingDinoImageProcessor"] + + +if TYPE_CHECKING: + from .configuration_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP, + GroundingDinoConfig, + ) + from .processing_grounding_dino import GroundingDinoProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_grounding_dino import ( + GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST, + GroundingDinoForObjectDetection, + GroundingDinoModel, + GroundingDinoPreTrainedModel, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_grounding_dino import GroundingDinoImageProcessor + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__) diff --git a/src/transformers/models/grounding_dino/configuration_grounding_dino.py b/src/transformers/models/grounding_dino/configuration_grounding_dino.py new file mode 100644 index 0000000000..fe68303503 --- /dev/null +++ b/src/transformers/models/grounding_dino/configuration_grounding_dino.py @@ -0,0 +1,301 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Grounding DINO model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING + + +logger = logging.get_logger(__name__) + +GROUNDING_DINO_PRETRAINED_CONFIG_ARCHIVE_MAP = { + "IDEA-Research/grounding-dino-tiny": "https://huggingface.co/IDEA-Research/grounding-dino-tiny/resolve/main/config.json", +} + + +class GroundingDinoConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`GroundingDinoModel`]. It is used to instantiate a + Grounding DINO model according to the specified arguments, defining the model architecture. 
Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Grounding DINO
+    [IDEA-Research/grounding-dino-tiny](https://huggingface.co/IDEA-Research/grounding-dino-tiny) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
+            The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `BertConfig`):
+            The config object or dictionary of the text backbone.
+        num_queries (`int`, *optional*, defaults to 900):
+            Number of object queries, i.e. detection slots. This is the maximal number of objects
+            [`GroundingDinoModel`] can detect in a single image.
+        encoder_layers (`int`, *optional*, defaults to 6):
+            Number of encoder layers.
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as an encoder/decoder or not.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimension of the layers.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        auxiliary_loss (`bool`, *optional*, defaults to `False`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+ position_embedding_type (`str`, *optional*, defaults to `"sine"`): + Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`. + num_feature_levels (`int`, *optional*, defaults to 4): + The number of input feature levels. + encoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the encoder. + decoder_n_points (`int`, *optional*, defaults to 4): + The number of sampled keys in each feature level for each attention head in the decoder. + two_stage (`bool`, *optional*, defaults to `True`): + Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of + Grounding DINO, which are further fed into the decoder for iterative bounding box refinement. + class_cost (`float`, *optional*, defaults to 1.0): + Relative weight of the classification error in the Hungarian matching cost. + bbox_cost (`float`, *optional*, defaults to 5.0): + Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost. + giou_cost (`float`, *optional*, defaults to 2.0): + Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost. + bbox_loss_coefficient (`float`, *optional*, defaults to 5.0): + Relative weight of the L1 bounding box loss in the object detection loss. + giou_loss_coefficient (`float`, *optional*, defaults to 2.0): + Relative weight of the generalized IoU loss in the object detection loss. + focal_alpha (`float`, *optional*, defaults to 0.25): + Alpha parameter in the focal loss. + disable_custom_kernels (`bool`, *optional*, defaults to `False`): + Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom + kernels are not supported by PyTorch ONNX export. + max_text_len (`int`, *optional*, defaults to 256): + The maximum length of the text input. + text_enhancer_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the text enhancer. + fusion_droppath (`float`, *optional*, defaults to 0.1): + The droppath ratio for the fusion module. + fusion_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the fusion module. + embedding_init_target (`bool`, *optional*, defaults to `True`): + Whether to initialize the target with Embedding weights. + query_dim (`int`, *optional*, defaults to 4): + The dimension of the query vector. + decoder_bbox_embed_share (`bool`, *optional*, defaults to `True`): + Whether to share the bbox regression head for all decoder layers. + two_stage_bbox_embed_share (`bool`, *optional*, defaults to `False`): + Whether to share the bbox embedding between the two-stage bbox generator and the region proposal + generation. + positional_embedding_temperature (`float`, *optional*, defaults to 20): + The temperature for Sine Positional Embedding that is used together with vision backbone. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. 
+ + Examples: + + ```python + >>> from transformers import GroundingDinoConfig, GroundingDinoModel + + >>> # Initializing a Grounding DINO IDEA-Research/grounding-dino-tiny style configuration + >>> configuration = GroundingDinoConfig() + + >>> # Initializing a model (with random weights) from the IDEA-Research/grounding-dino-tiny style configuration + >>> model = GroundingDinoModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "grounding-dino" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + } + + def __init__( + self, + backbone_config=None, + backbone=None, + use_pretrained_backbone=False, + use_timm_backbone=False, + backbone_kwargs=None, + text_config=None, + num_queries=900, + encoder_layers=6, + encoder_ffn_dim=2048, + encoder_attention_heads=8, + decoder_layers=6, + decoder_ffn_dim=2048, + decoder_attention_heads=8, + is_encoder_decoder=True, + activation_function="relu", + d_model=256, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + auxiliary_loss=False, + position_embedding_type="sine", + num_feature_levels=4, + encoder_n_points=4, + decoder_n_points=4, + two_stage=True, + class_cost=1.0, + bbox_cost=5.0, + giou_cost=2.0, + bbox_loss_coefficient=5.0, + giou_loss_coefficient=2.0, + focal_alpha=0.25, + disable_custom_kernels=False, + # other parameters + max_text_len=256, + text_enhancer_dropout=0.0, + fusion_droppath=0.1, + fusion_dropout=0.0, + embedding_init_target=True, + query_dim=4, + decoder_bbox_embed_share=True, + two_stage_bbox_embed_share=False, + positional_embedding_temperature=20, + init_std=0.02, + layer_norm_eps=1e-5, + **kwargs, + ): + if not use_timm_backbone and use_pretrained_backbone: + raise ValueError( + "Loading pretrained backbone weights from the transformers library is not supported yet. `use_timm_backbone` must be set to `True` when `use_pretrained_backbone=True`" + ) + + if backbone_config is not None and backbone is not None: + raise ValueError("You can't specify both `backbone` and `backbone_config`.") + + if backbone_config is None and backbone is None: + logger.info("`backbone_config` is `None`. Initializing the config with the default `Swin` backbone.") + backbone_config = CONFIG_MAPPING["swin"]( + window_size=7, + image_size=224, + embed_dim=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + out_indices=[2, 3, 4], + ) + elif isinstance(backbone_config, dict): + backbone_model_type = backbone_config.pop("model_type") + config_class = CONFIG_MAPPING[backbone_model_type] + backbone_config = config_class.from_dict(backbone_config) + + if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None: + raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. 
Initializing the text config with default values (`BertConfig`).") + + self.backbone_config = backbone_config + self.backbone = backbone + self.use_pretrained_backbone = use_pretrained_backbone + self.use_timm_backbone = use_timm_backbone + self.backbone_kwargs = backbone_kwargs + self.num_queries = num_queries + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.auxiliary_loss = auxiliary_loss + self.position_embedding_type = position_embedding_type + # deformable attributes + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.two_stage = two_stage + # Hungarian matcher + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + # Loss coefficients + self.bbox_loss_coefficient = bbox_loss_coefficient + self.giou_loss_coefficient = giou_loss_coefficient + self.focal_alpha = focal_alpha + self.disable_custom_kernels = disable_custom_kernels + # Text backbone + if isinstance(text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "bert" + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["bert"]() + + self.text_config = text_config + self.max_text_len = max_text_len + + # Text Enhancer + self.text_enhancer_dropout = text_enhancer_dropout + # Fusion + self.fusion_droppath = fusion_droppath + self.fusion_dropout = fusion_dropout + # Others + self.embedding_init_target = embedding_init_target + self.query_dim = query_dim + self.decoder_bbox_embed_share = decoder_bbox_embed_share + self.two_stage_bbox_embed_share = two_stage_bbox_embed_share + if two_stage_bbox_embed_share and not decoder_bbox_embed_share: + raise ValueError("If two_stage_bbox_embed_share is True, decoder_bbox_embed_share must be True.") + self.positional_embedding_temperature = positional_embedding_temperature + self.init_std = init_std + self.layer_norm_eps = layer_norm_eps + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def num_attention_heads(self) -> int: + return self.encoder_attention_heads + + @property + def hidden_size(self) -> int: + return self.d_model diff --git a/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py new file mode 100644 index 0000000000..ac8e82bfd8 --- /dev/null +++ b/src/transformers/models/grounding_dino/convert_grounding_dino_to_hf.py @@ -0,0 +1,491 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert Grounding DINO checkpoints from the original repository.
+
+URL: https://github.com/IDEA-Research/GroundingDINO"""
+
+import argparse
+
+import requests
+import torch
+from PIL import Image
+from torchvision import transforms as T
+
+from transformers import (
+    AutoTokenizer,
+    GroundingDinoConfig,
+    GroundingDinoForObjectDetection,
+    GroundingDinoImageProcessor,
+    GroundingDinoProcessor,
+    SwinConfig,
+)
+
+
+IMAGENET_MEAN = [0.485, 0.456, 0.406]
+IMAGENET_STD = [0.229, 0.224, 0.225]
+
+
+def get_grounding_dino_config(model_name):
+    if "tiny" in model_name:
+        window_size = 7
+        embed_dim = 96
+        depths = (2, 2, 6, 2)
+        num_heads = (3, 6, 12, 24)
+        image_size = 224
+    elif "base" in model_name:
+        window_size = 12
+        embed_dim = 128
+        depths = (2, 2, 18, 2)
+        num_heads = (4, 8, 16, 32)
+        image_size = 384
+    else:
+        raise ValueError("Model not supported, only supports tiny and base variants")
+
+    backbone_config = SwinConfig(
+        window_size=window_size,
+        image_size=image_size,
+        embed_dim=embed_dim,
+        depths=depths,
+        num_heads=num_heads,
+        out_indices=[2, 3, 4],
+    )
+
+    config = GroundingDinoConfig(backbone_config=backbone_config)
+
+    return config
+
+
+def create_rename_keys(state_dict, config):
+    rename_keys = []
+    # fmt: off
+    ########################################## VISION BACKBONE - START
+    # patch embedding layer
+    rename_keys.append(("backbone.0.patch_embed.proj.weight",
+                        "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.weight"))
+    rename_keys.append(("backbone.0.patch_embed.proj.bias",
+                        "model.backbone.conv_encoder.model.embeddings.patch_embeddings.projection.bias"))
+    rename_keys.append(("backbone.0.patch_embed.norm.weight",
+                        "model.backbone.conv_encoder.model.embeddings.norm.weight"))
+    rename_keys.append(("backbone.0.patch_embed.norm.bias",
+                        "model.backbone.conv_encoder.model.embeddings.norm.bias"))
+
+    for layer, depth in enumerate(config.backbone_config.depths):
+        for block in range(depth):
+            # layernorms
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.weight",
+                                f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.weight"))
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm1.bias",
+                                f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_before.bias"))
+
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.weight",
+                                f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.weight"))
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.norm2.bias",
+                                f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.layernorm_after.bias"))
+            # attention
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.relative_position_bias_table",
+                                f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.relative_position_bias_table"))
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.weight",
+                                f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.weight"))
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.attn.proj.bias",
+                                f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.output.dense.bias"))
+            # intermediate
+            rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.weight",
+
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.weight")) + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc1.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.intermediate.dense.bias")) + + # output + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.weight")) + rename_keys.append((f"backbone.0.layers.{layer}.blocks.{block}.mlp.fc2.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.output.dense.bias")) + + # downsample + if layer!=len(config.backbone_config.depths)-1: + rename_keys.append((f"backbone.0.layers.{layer}.downsample.reduction.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.reduction.weight")) + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.weight", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.weight")) + rename_keys.append((f"backbone.0.layers.{layer}.downsample.norm.bias", + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.downsample.norm.bias")) + + for out_indice in config.backbone_config.out_indices: + # Grounding DINO implementation of out_indices isn't aligned with transformers + rename_keys.append((f"backbone.0.norm{out_indice-1}.weight", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.weight")) + rename_keys.append((f"backbone.0.norm{out_indice-1}.bias", + f"model.backbone.conv_encoder.model.hidden_states_norms.stage{out_indice}.bias")) + + ########################################## VISION BACKBONE - END + + ########################################## ENCODER - START + deformable_key_mappings = { + 'self_attn.sampling_offsets.weight': 'deformable_layer.self_attn.sampling_offsets.weight', + 'self_attn.sampling_offsets.bias': 'deformable_layer.self_attn.sampling_offsets.bias', + 'self_attn.attention_weights.weight': 'deformable_layer.self_attn.attention_weights.weight', + 'self_attn.attention_weights.bias': 'deformable_layer.self_attn.attention_weights.bias', + 'self_attn.value_proj.weight': 'deformable_layer.self_attn.value_proj.weight', + 'self_attn.value_proj.bias': 'deformable_layer.self_attn.value_proj.bias', + 'self_attn.output_proj.weight': 'deformable_layer.self_attn.output_proj.weight', + 'self_attn.output_proj.bias': 'deformable_layer.self_attn.output_proj.bias', + 'norm1.weight': 'deformable_layer.self_attn_layer_norm.weight', + 'norm1.bias': 'deformable_layer.self_attn_layer_norm.bias', + 'linear1.weight': 'deformable_layer.fc1.weight', + 'linear1.bias': 'deformable_layer.fc1.bias', + 'linear2.weight': 'deformable_layer.fc2.weight', + 'linear2.bias': 'deformable_layer.fc2.bias', + 'norm2.weight': 'deformable_layer.final_layer_norm.weight', + 'norm2.bias': 'deformable_layer.final_layer_norm.bias', + } + text_enhancer_key_mappings = { + 'self_attn.in_proj_weight': 'text_enhancer_layer.self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 'text_enhancer_layer.self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'text_enhancer_layer.self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'text_enhancer_layer.self_attn.out_proj.bias', + 'linear1.weight': 'text_enhancer_layer.fc1.weight', + 'linear1.bias': 'text_enhancer_layer.fc1.bias', + 'linear2.weight': 'text_enhancer_layer.fc2.weight', + 'linear2.bias': 'text_enhancer_layer.fc2.bias', + 'norm1.weight': 
'text_enhancer_layer.layer_norm_before.weight', + 'norm1.bias': 'text_enhancer_layer.layer_norm_before.bias', + 'norm2.weight': 'text_enhancer_layer.layer_norm_after.weight', + 'norm2.bias': 'text_enhancer_layer.layer_norm_after.bias', + } + fusion_key_mappings = { + 'gamma_v': 'fusion_layer.vision_param', + 'gamma_l': 'fusion_layer.text_param', + 'layer_norm_v.weight': 'fusion_layer.layer_norm_vision.weight', + 'layer_norm_v.bias': 'fusion_layer.layer_norm_vision.bias', + 'layer_norm_l.weight': 'fusion_layer.layer_norm_text.weight', + 'layer_norm_l.bias': 'fusion_layer.layer_norm_text.bias', + 'attn.v_proj.weight': 'fusion_layer.attn.vision_proj.weight', + 'attn.v_proj.bias': 'fusion_layer.attn.vision_proj.bias', + 'attn.l_proj.weight': 'fusion_layer.attn.text_proj.weight', + 'attn.l_proj.bias': 'fusion_layer.attn.text_proj.bias', + 'attn.values_v_proj.weight': 'fusion_layer.attn.values_vision_proj.weight', + 'attn.values_v_proj.bias': 'fusion_layer.attn.values_vision_proj.bias', + 'attn.values_l_proj.weight': 'fusion_layer.attn.values_text_proj.weight', + 'attn.values_l_proj.bias': 'fusion_layer.attn.values_text_proj.bias', + 'attn.out_v_proj.weight': 'fusion_layer.attn.out_vision_proj.weight', + 'attn.out_v_proj.bias': 'fusion_layer.attn.out_vision_proj.bias', + 'attn.out_l_proj.weight': 'fusion_layer.attn.out_text_proj.weight', + 'attn.out_l_proj.bias': 'fusion_layer.attn.out_text_proj.bias', + } + for layer in range(config.encoder_layers): + # deformable + for src, dest in deformable_key_mappings.items(): + rename_keys.append((f"transformer.encoder.layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # text enhance + for src, dest in text_enhancer_key_mappings.items(): + rename_keys.append((f"transformer.encoder.text_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + # fusion layers + for src, dest in fusion_key_mappings.items(): + rename_keys.append((f"transformer.encoder.fusion_layers.{layer}.{src}", + f"model.encoder.layers.{layer}.{dest}")) + ########################################## ENCODER - END + + ########################################## DECODER - START + key_mappings_decoder = { + 'cross_attn.sampling_offsets.weight': 'encoder_attn.sampling_offsets.weight', + 'cross_attn.sampling_offsets.bias': 'encoder_attn.sampling_offsets.bias', + 'cross_attn.attention_weights.weight': 'encoder_attn.attention_weights.weight', + 'cross_attn.attention_weights.bias': 'encoder_attn.attention_weights.bias', + 'cross_attn.value_proj.weight': 'encoder_attn.value_proj.weight', + 'cross_attn.value_proj.bias': 'encoder_attn.value_proj.bias', + 'cross_attn.output_proj.weight': 'encoder_attn.output_proj.weight', + 'cross_attn.output_proj.bias': 'encoder_attn.output_proj.bias', + 'norm1.weight': 'encoder_attn_layer_norm.weight', + 'norm1.bias': 'encoder_attn_layer_norm.bias', + 'ca_text.in_proj_weight': 'encoder_attn_text.in_proj_weight', + 'ca_text.in_proj_bias': 'encoder_attn_text.in_proj_bias', + 'ca_text.out_proj.weight': 'encoder_attn_text.out_proj.weight', + 'ca_text.out_proj.bias': 'encoder_attn_text.out_proj.bias', + 'catext_norm.weight': 'encoder_attn_text_layer_norm.weight', + 'catext_norm.bias': 'encoder_attn_text_layer_norm.bias', + 'self_attn.in_proj_weight': 'self_attn.in_proj_weight', + 'self_attn.in_proj_bias': 'self_attn.in_proj_bias', + 'self_attn.out_proj.weight': 'self_attn.out_proj.weight', + 'self_attn.out_proj.bias': 'self_attn.out_proj.bias', + 'norm2.weight': 'self_attn_layer_norm.weight', + 'norm2.bias': 'self_attn_layer_norm.bias', + 
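+        # the entries below map the decoder feed-forward block (linear1/linear2 -> fc1/fc2, norm3 -> final_layer_norm)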
'linear1.weight': 'fc1.weight', + 'linear1.bias': 'fc1.bias', + 'linear2.weight': 'fc2.weight', + 'linear2.bias': 'fc2.bias', + 'norm3.weight': 'final_layer_norm.weight', + 'norm3.bias': 'final_layer_norm.bias', + } + for layer_num in range(config.decoder_layers): + source_prefix_decoder = f'transformer.decoder.layers.{layer_num}.' + target_prefix_decoder = f'model.decoder.layers.{layer_num}.' + + for source_name, target_name in key_mappings_decoder.items(): + rename_keys.append((source_prefix_decoder + source_name, + target_prefix_decoder + target_name)) + ########################################## DECODER - END + + ########################################## Additional - START + for layer_name, params in state_dict.items(): + #### TEXT BACKBONE + if "bert" in layer_name: + rename_keys.append((layer_name, layer_name.replace("bert", "model.text_backbone"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM VISION BACKBONE + if "input_proj" in layer_name: + rename_keys.append((layer_name, layer_name.replace("input_proj", "model.input_proj_vision"))) + #### INPUT PROJ - PROJECT OUTPUT FEATURES FROM TEXT BACKBONE + if "feat_map" in layer_name: + rename_keys.append((layer_name, layer_name.replace("feat_map", "model.text_projection"))) + #### DECODER REFERENCE POINT HEAD + if "transformer.decoder.ref_point_head" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.ref_point_head", + "model.decoder.reference_points_head"))) + #### DECODER BBOX EMBED + if "transformer.decoder.bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.decoder.bbox_embed", + "model.decoder.bbox_embed"))) + if "transformer.enc_output" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer", "model"))) + + if "transformer.enc_out_bbox_embed" in layer_name: + rename_keys.append((layer_name, layer_name.replace("transformer.enc_out_bbox_embed", + "model.encoder_output_bbox_embed"))) + + rename_keys.append(("transformer.level_embed", "model.level_embed")) + rename_keys.append(("transformer.decoder.norm.weight", "model.decoder.layer_norm.weight")) + rename_keys.append(("transformer.decoder.norm.bias", "model.decoder.layer_norm.bias")) + rename_keys.append(("transformer.tgt_embed.weight", "model.query_position_embeddings.weight")) + ########################################## Additional - END + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v_encoder(state_dict, config): + ########################################## VISION BACKBONE - START + embed_dim = config.backbone_config.embed_dim + for layer, depth in enumerate(config.backbone_config.depths): + hidden_size = embed_dim * 2**layer + for block in range(depth): + # read in weights + bias of input projection layer (in timm, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.weight") + in_proj_bias = state_dict.pop(f"backbone.0.layers.{layer}.blocks.{block}.attn.qkv.bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.weight" + ] = in_proj_weight[:hidden_size, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.query.bias" + ] = in_proj_bias[:hidden_size] + + state_dict[ + 
f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.weight" + ] = in_proj_weight[hidden_size : hidden_size * 2, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.key.bias" + ] = in_proj_bias[hidden_size : hidden_size * 2] + + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.weight" + ] = in_proj_weight[-hidden_size:, :] + state_dict[ + f"model.backbone.conv_encoder.model.encoder.layers.{layer}.blocks.{block}.attention.self.value.bias" + ] = in_proj_bias[-hidden_size:] + ########################################## VISION BACKBONE - END + + +def read_in_q_k_v_text_enhancer(state_dict, config): + hidden_size = config.hidden_size + for idx in range(config.encoder_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.weight"] = in_proj_weight[ + :hidden_size, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.key.bias"] = in_proj_bias[ + hidden_size : hidden_size * 2 + ] + + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.weight"] = in_proj_weight[ + -hidden_size:, : + ] + state_dict[f"model.encoder.layers.{idx}.text_enhancer_layer.self_attn.value.bias"] = in_proj_bias[ + -hidden_size: + ] + + +def read_in_q_k_v_decoder(state_dict, config): + hidden_size = config.hidden_size + for idx in range(config.decoder_layers): + # read in weights + bias of input projection layer (in original implementation, this is a single matrix + bias) + in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.self_attn.in_proj_bias") + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{idx}.self_attn.query.weight"] = in_proj_weight[:hidden_size, :] + state_dict[f"model.decoder.layers.{idx}.self_attn.query.bias"] = in_proj_bias[:hidden_size] + + state_dict[f"model.decoder.layers.{idx}.self_attn.key.weight"] = in_proj_weight[ + hidden_size : hidden_size * 2, : + ] + state_dict[f"model.decoder.layers.{idx}.self_attn.key.bias"] = in_proj_bias[hidden_size : hidden_size * 2] + + state_dict[f"model.decoder.layers.{idx}.self_attn.value.weight"] = in_proj_weight[-hidden_size:, :] + state_dict[f"model.decoder.layers.{idx}.self_attn.value.bias"] = in_proj_bias[-hidden_size:] + + # read in weights + bias of cross-attention + in_proj_weight = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_weight") + in_proj_bias = state_dict.pop(f"model.decoder.layers.{idx}.encoder_attn_text.in_proj_bias") + + # next, add query, keys and values (in that order) to the state dict + state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.weight"] = in_proj_weight[:hidden_size, :] + 
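+        # in_proj_weight and in_proj_bias stack q, k and v along dim 0: rows [0:h] are q, [h:2h] are k,
+        # [2h:3h] are v (with h = hidden_size); the same slicing is applied to the bias below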
+        state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.query.bias"] = in_proj_bias[:hidden_size]
+
+        state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.weight"] = in_proj_weight[
+            hidden_size : hidden_size * 2, :
+        ]
+        state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.key.bias"] = in_proj_bias[
+            hidden_size : hidden_size * 2
+        ]
+
+        state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.weight"] = in_proj_weight[-hidden_size:, :]
+        state_dict[f"model.decoder.layers.{idx}.encoder_attn_text.value.bias"] = in_proj_bias[-hidden_size:]
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+    return image
+
+
+def preprocess_caption(caption: str) -> str:
+    result = caption.lower().strip()
+    if result.endswith("."):
+        return result
+    return result + "."
+
+
+@torch.no_grad()
+def convert_grounding_dino_checkpoint(args):
+    model_name = args.model_name
+    pytorch_dump_folder_path = args.pytorch_dump_folder_path
+    push_to_hub = args.push_to_hub
+    verify_logits = args.verify_logits
+
+    checkpoint_mapping = {
+        "grounding-dino-tiny": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swint_ogc.pth",
+        "grounding-dino-base": "https://huggingface.co/ShilongLiu/GroundingDino/resolve/main/groundingdino_swinb_cogcoor.pth",
+    }
+    # Define default GroundingDino configuration
+    config = get_grounding_dino_config(model_name)
+
+    # Load original checkpoint
+    checkpoint_url = checkpoint_mapping[model_name]
+    original_state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu")["model"]
+    original_state_dict = {k.replace("module.", ""): v for k, v in original_state_dict.items()}
+
+    for name, param in original_state_dict.items():
+        print(name, param.shape)
+
+    # Rename keys
+    new_state_dict = original_state_dict.copy()
+    rename_keys = create_rename_keys(original_state_dict, config)
+
+    for src, dest in rename_keys:
+        rename_key(new_state_dict, src, dest)
+    read_in_q_k_v_encoder(new_state_dict, config)
+    read_in_q_k_v_text_enhancer(new_state_dict, config)
+    read_in_q_k_v_decoder(new_state_dict, config)
+
+    # Load HF model
+    model = GroundingDinoForObjectDetection(config)
+    model.eval()
+    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
+    print("Missing keys:", missing_keys)
+    print("Unexpected keys:", unexpected_keys)
+
+    # Load and process test image
+    image = prepare_img()
+    transforms = T.Compose([T.Resize(size=800, max_size=1333), T.ToTensor(), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)])
+    original_pixel_values = transforms(image).unsqueeze(0)
+
+    image_processor = GroundingDinoImageProcessor()
+    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+    processor = GroundingDinoProcessor(image_processor=image_processor, tokenizer=tokenizer)
+
+    text = "a cat"
+    inputs = processor(images=image, text=preprocess_caption(text), return_tensors="pt")
+
+    assert torch.allclose(original_pixel_values, inputs.pixel_values, atol=1e-4)
+
+    if verify_logits:
+        # Running forward
+        with torch.no_grad():
+            outputs = model(**inputs)
+
+        print(outputs.logits[0, :3, :3])
+
+        expected_slice = torch.tensor(
+            [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]]
+        )
+
+        assert torch.allclose(outputs.logits[0, :3, :3], expected_slice, atol=1e-4)
+        print("Looks ok!")
+
+    if pytorch_dump_folder_path is not None:
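+        # writes config.json plus the model weights, and the processor/tokenizer files, into the output folder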
+        model.save_pretrained(pytorch_dump_folder_path)
+        processor.save_pretrained(pytorch_dump_folder_path)
+
+    if push_to_hub:
+        model.push_to_hub(f"EduardoPacheco/{model_name}")
+        processor.push_to_hub(f"EduardoPacheco/{model_name}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--model_name",
+        default="grounding-dino-tiny",
+        type=str,
+        choices=["grounding-dino-tiny", "grounding-dino-base"],
+        help="Name of the GroundingDino model you'd like to convert.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory."
+    )
+    parser.add_argument(
+        "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub."
+    )
+    parser.add_argument(
+        "--verify_logits",
+        action="store_false",
+        help="Logits are verified by default; pass this flag to skip the verification after conversion.",
+    )
+
+    args = parser.parse_args()
+    convert_grounding_dino_checkpoint(args)
diff --git a/src/transformers/models/grounding_dino/image_processing_grounding_dino.py b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
new file mode 100644
index 0000000000..8b39d6801c
--- /dev/null
+++ b/src/transformers/models/grounding_dino/image_processing_grounding_dino.py
@@ -0,0 +1,1511 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
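+# Note: most helpers below are copied from transformers.models.detr.image_processing_detr (see the
+# "Copied from" markers on each function).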
+"""Image processor class for Grounding DINO."""
+
+import io
+import pathlib
+from collections import defaultdict
+from typing import Any, Callable, Dict, Iterable, List, Optional, Set, Tuple, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_processing_utils import BaseImageProcessor, get_size_dict
+from ...image_transforms import (
+    PaddingMode,
+    center_to_corners_format,
+    corners_to_center_format,
+    id_to_rgb,
+    pad,
+    rescale,
+    resize,
+    rgb_to_id,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_annotations,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    ExplicitEnum,
+    TensorType,
+    is_flax_available,
+    is_jax_tensor,
+    is_scipy_available,
+    is_tf_available,
+    is_tf_tensor,
+    is_torch_available,
+    is_torch_tensor,
+    is_vision_available,
+    logging,
+)
+
+
+if is_torch_available():
+    import torch
+    from torch import nn
+
+
+if is_vision_available():
+    import PIL
+
+if is_scipy_available():
+    import scipy.special
+    import scipy.stats
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+AnnotationType = Dict[str, Union[int, str, List[Dict]]]
+
+
+class AnnotationFormat(ExplicitEnum):
+    COCO_DETECTION = "coco_detection"
+    COCO_PANOPTIC = "coco_panoptic"
+
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_size_with_aspect_ratio
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size.
+
+    Args:
+        image_size (`Tuple[int, int]`):
+            The input image size.
+        size (`int`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+    """
+    height, width = image_size
+    if max_size is not None:
+        min_original_size = float(min((height, width)))
+        max_original_size = float(max((height, width)))
+        if max_original_size / min_original_size * size > max_size:
+            size = int(round(max_size * min_original_size / max_original_size))
+
+    if (height <= width and height == size) or (width <= height and width == size):
+        return height, width
+
+    if width < height:
+        ow = size
+        oh = int(size * height / width)
+    else:
+        oh = size
+        ow = int(size * width / height)
+    return (oh, ow)
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_resize_output_image_size
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, Tuple[int, int], List[int]],
+    max_size: Optional[int] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> Tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size. If the desired output size
+    is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
+    image size is computed by keeping the aspect ratio of the input image size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `Tuple[int, int]` or `List[int]`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
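+            If matching the shorter edge to `size` would push the longer edge past this value, the target size is
+            scaled down so that the longer edge equals `max_size`.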
+ input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input image. + """ + image_size = get_image_size(input_image, input_data_format) + if isinstance(size, (list, tuple)): + return size + + return get_size_with_aspect_ratio(image_size, size, max_size) + + +# Copied from transformers.models.detr.image_processing_detr.get_numpy_to_framework_fn +def get_numpy_to_framework_fn(arr) -> Callable: + """ + Returns a function that converts a numpy array to the framework of the input array. + + Args: + arr (`np.ndarray`): The array to convert. + """ + if isinstance(arr, np.ndarray): + return np.array + if is_tf_available() and is_tf_tensor(arr): + import tensorflow as tf + + return tf.convert_to_tensor + if is_torch_available() and is_torch_tensor(arr): + import torch + + return torch.tensor + if is_flax_available() and is_jax_tensor(arr): + import jax.numpy as jnp + + return jnp.array + raise ValueError(f"Cannot convert arrays of type {type(arr)}") + + +# Copied from transformers.models.detr.image_processing_detr.safe_squeeze +def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray: + """ + Squeezes an array, but only if the axis specified has dim 1. + """ + if axis is None: + return arr.squeeze() + + try: + return arr.squeeze(axis=axis) + except ValueError: + return arr + + +# Copied from transformers.models.detr.image_processing_detr.normalize_annotation +def normalize_annotation(annotation: Dict, image_size: Tuple[int, int]) -> Dict: + image_height, image_width = image_size + norm_annotation = {} + for key, value in annotation.items(): + if key == "boxes": + boxes = value + boxes = corners_to_center_format(boxes) + boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32) + norm_annotation[key] = boxes + else: + norm_annotation[key] = value + return norm_annotation + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.detr.image_processing_detr.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. 
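+
+    Example (illustrative): an image of height 2 and width 4 with `output_size=(3, 5)` yields a `(3, 5)` mask whose
+    top-left `2x4` block is ones and the rest zeros.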
+ """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.detr.image_processing_detr.convert_coco_poly_to_mask +def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray: + """ + Convert a COCO polygon annotation to a mask. + + Args: + segmentations (`List[List[float]]`): + List of polygons, each polygon represented by a list of x-y coordinates. + height (`int`): + Height of the mask. + width (`int`): + Width of the mask. + """ + try: + from pycocotools import mask as coco_mask + except ImportError: + raise ImportError("Pycocotools is not installed in your environment.") + + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = np.asarray(mask, dtype=np.uint8) + mask = np.any(mask, axis=2) + masks.append(mask) + if masks: + masks = np.stack(masks, axis=0) + else: + masks = np.zeros((0, height, width), dtype=np.uint8) + + return masks + + +# Copied from transformers.models.detr.image_processing_detr.prepare_coco_detection_annotation with DETR->GroundingDino +def prepare_coco_detection_annotation( + image, + target, + return_segmentation_masks: bool = False, + input_data_format: Optional[Union[ChannelDimension, str]] = None, +): + """ + Convert the target in COCO format into the format expected by GroundingDino. + """ + image_height, image_width = get_image_size(image, channel_dim=input_data_format) + + image_id = target["image_id"] + image_id = np.asarray([image_id], dtype=np.int64) + + # Get all COCO annotations for the given image. 
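+    # e.g. (illustrative values only):
+    # target = {"image_id": 39769,
+    #           "annotations": [{"bbox": [x, y, width, height], "category_id": 17, "area": 1035.0, "iscrowd": 0}]}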
+    annotations = target["annotations"]
+    annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+    classes = [obj["category_id"] for obj in annotations]
+    classes = np.asarray(classes, dtype=np.int64)
+
+    # for conversion to coco api
+    area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+    iscrowd = np.asarray([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in annotations], dtype=np.int64)
+
+    boxes = [obj["bbox"] for obj in annotations]
+    # guard against no boxes via resizing
+    boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+    boxes[:, 2:] += boxes[:, :2]
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {}
+    new_target["image_id"] = image_id
+    new_target["class_labels"] = classes[keep]
+    new_target["boxes"] = boxes[keep]
+    new_target["area"] = area[keep]
+    new_target["iscrowd"] = iscrowd[keep]
+    new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+    if annotations and "keypoints" in annotations[0]:
+        keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
+        keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+# Copied from transformers.models.detr.image_processing_detr.masks_to_boxes
+def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
+    """
+    Compute the bounding boxes around the provided panoptic segmentation masks.
+
+    Args:
+        masks: masks in format `[number_masks, height, width]`, where `number_masks` is the number of masks
+
+    Returns:
+        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+    """
+    if masks.size == 0:
+        return np.zeros((0, 4))
+
+    h, w = masks.shape[-2:]
+    y = np.arange(0, h, dtype=np.float32)
+    x = np.arange(0, w, dtype=np.float32)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    y, x = np.meshgrid(y, x, indexing="ij")
+
+    x_mask = masks * np.expand_dims(x, axis=0)
+    x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
+    x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
+    x_min = x.filled(fill_value=1e8)
+    x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
+
+    y_mask = masks * np.expand_dims(y, axis=0)
+    y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
+    y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
+    y_min = y.filled(fill_value=1e8)
+    y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
+
+    return np.stack([x_min, y_min, x_max, y_max], 1)
+
+
+# Copied from transformers.models.detr.image_processing_detr.prepare_coco_panoptic_annotation with DETR->GroundingDino
+def prepare_coco_panoptic_annotation(
+    image: np.ndarray,
+    target: Dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
+) -> Dict:
+    """
+    Prepare a coco panoptic annotation for GroundingDino.
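+
+    Expects `target` to provide a `file_name` pointing to the panoptic PNG under `masks_path` and, when present, a
+    `segments_info` list with `id`, `category_id`, `iscrowd` and `area` per segment.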
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+    annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+    new_target = {}
+    new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
+    new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
+    new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
+
+    if "segments_info" in target:
+        masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
+        masks = rgb_to_id(masks)
+
+        ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
+        masks = masks == ids[:, None, None]
+        masks = masks.astype(np.uint8)
+        if return_masks:
+            new_target["masks"] = masks
+        new_target["boxes"] = masks_to_boxes(masks)
+        new_target["class_labels"] = np.array(
+            [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
+        )
+        new_target["iscrowd"] = np.asarray(
+            [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
+        )
+        new_target["area"] = np.asarray(
+            [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
+        )
+
+    return new_target
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_segmentation_image
+def get_segmentation_image(
+    masks: np.ndarray, input_size: Tuple, target_size: Tuple, stuff_equiv_classes, deduplicate=False
+):
+    h, w = input_size
+    final_h, final_w = target_size
+
+    m_id = scipy.special.softmax(masks.transpose(0, 1), -1)
+
+    if m_id.shape[-1] == 0:
+        # We didn't detect any mask :(
+        m_id = np.zeros((h, w), dtype=np.int64)
+    else:
+        m_id = m_id.argmax(-1).reshape(h, w)
+
+    if deduplicate:
+        # Merge the masks corresponding to the same stuff class
+        for equiv in stuff_equiv_classes.values():
+            for eq_id in equiv:
+                m_id[m_id == eq_id] = equiv[0]
+
+    seg_img = id_to_rgb(m_id)
+    seg_img = resize(seg_img, (final_w, final_h), resample=PILImageResampling.NEAREST)
+    return seg_img
+
+
+# Copied from transformers.models.detr.image_processing_detr.get_mask_area
+def get_mask_area(seg_img: np.ndarray, target_size: Tuple[int, int], n_classes: int) -> np.ndarray:
+    final_h, final_w = target_size
+    np_seg_img = seg_img.astype(np.uint8)
+    np_seg_img = np_seg_img.reshape(final_h, final_w, 3)
+    m_id = rgb_to_id(np_seg_img)
+    area = [(m_id == i).sum() for i in range(n_classes)]
+    return area
+
+
+# Copied from transformers.models.detr.image_processing_detr.score_labels_from_class_probabilities
+def score_labels_from_class_probabilities(logits: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    probs = scipy.special.softmax(logits, axis=-1)
+    labels = probs.argmax(-1, keepdims=True)
+    scores = np.take_along_axis(probs, labels, axis=-1)
+    scores, labels = scores.squeeze(-1), labels.squeeze(-1)
+    return scores, labels
+
+
+# Copied from transformers.models.detr.image_processing_detr.post_process_panoptic_sample
+def post_process_panoptic_sample(
+    out_logits: np.ndarray,
+    masks: np.ndarray,
+    boxes: np.ndarray,
+    processed_size: Tuple[int, int],
+    target_size: Tuple[int, int],
+    is_thing_map: Dict,
+    threshold=0.85,
+) -> Dict:
+    """
+    Converts the output of [`DetrForSegmentation`] into panoptic segmentation predictions for a single sample.
+
+    Args:
+        out_logits (`torch.Tensor`):
+            The logits for this sample.
+        masks (`torch.Tensor`):
+            The predicted segmentation masks for this sample.
+        boxes (`torch.Tensor`):
+            The predicted bounding boxes for this sample. The boxes are in the normalized format `(center_x, center_y,
+            width, height)` and values between `[0, 1]`, relative to the size of the image (disregarding padding).
+        processed_size (`Tuple[int, int]`):
+            The processed size of the image `(height, width)`, as returned by the preprocessing step i.e. the size
+            after data augmentation but before batching.
+        target_size (`Tuple[int, int]`):
+            The target size of the image, `(height, width)` corresponding to the requested final size of the
+            prediction.
+        is_thing_map (`Dict`):
+            A dictionary mapping class indices to a boolean value indicating whether the class is a thing or not.
+        threshold (`float`, *optional*, defaults to 0.85):
+            The threshold used to binarize the segmentation masks.
+    """
+    # we filter empty queries and detection below threshold
+    scores, labels = score_labels_from_class_probabilities(out_logits)
+    keep = (labels != out_logits.shape[-1] - 1) & (scores > threshold)
+
+    cur_scores = scores[keep]
+    cur_classes = labels[keep]
+    cur_boxes = center_to_corners_format(boxes[keep])
+
+    if len(cur_boxes) != len(cur_classes):
+        raise ValueError("Not as many boxes as there are classes")
+
+    cur_masks = masks[keep]
+    cur_masks = resize(cur_masks[:, None], processed_size, resample=PILImageResampling.BILINEAR)
+    cur_masks = safe_squeeze(cur_masks, 1)
+    b, h, w = cur_masks.shape
+
+    # It may be that we have several predicted masks for the same stuff class.
+    # In the following, we track the list of masks ids for each stuff class (they are merged later on)
+    cur_masks = cur_masks.reshape(b, -1)
+    stuff_equiv_classes = defaultdict(list)
+    for k, label in enumerate(cur_classes):
+        if not is_thing_map[label]:
+            stuff_equiv_classes[label].append(k)
+
+    seg_img = get_segmentation_image(cur_masks, processed_size, target_size, stuff_equiv_classes, deduplicate=True)
+    area = get_mask_area(cur_masks, processed_size, n_classes=len(cur_scores))
+
+    # We filter out any mask that is too small
+    if cur_classes.size > 0:
+        # We now filter empty masks as long as we find some
+        filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+        while filtered_small.any():
+            cur_masks = cur_masks[~filtered_small]
+            cur_scores = cur_scores[~filtered_small]
+            cur_classes = cur_classes[~filtered_small]
+            seg_img = get_segmentation_image(cur_masks, (h, w), target_size, stuff_equiv_classes, deduplicate=True)
+            area = get_mask_area(seg_img, target_size, n_classes=len(cur_scores))
+            filtered_small = np.array([a <= 4 for a in area], dtype=bool)
+    else:
+        cur_classes = np.ones((1, 1), dtype=np.int64)
+
+    segments_info = [
+        {"id": i, "isthing": is_thing_map[cat], "category_id": int(cat), "area": a}
+        for i, (cat, a) in enumerate(zip(cur_classes, area))
+    ]
+    del cur_classes
+
+    with io.BytesIO() as out:
+        PIL.Image.fromarray(seg_img).save(out, format="PNG")
+        predictions = {"png_string": out.getvalue(), "segments_info": segments_info}
+
+    return predictions
+
+
+# Copied from transformers.models.detr.image_processing_detr.resize_annotation
+def resize_annotation(
+    annotation: Dict[str, Any],
+    orig_size: Tuple[int, int],
+    target_size: Tuple[int, int],
+    threshold: float = 0.5,
+    resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+    """
+    Resizes an annotation to a target size.
+
+    Args:
+        annotation (`Dict[str, Any]`):
+            The annotation dictionary.
+        orig_size (`Tuple[int, int]`):
+            The original size of the input image.
+        target_size (`Tuple[int, int]`):
+            The target size of the image, as returned by the preprocessing `resize` step.
+        threshold (`float`, *optional*, defaults to 0.5):
+            The threshold used to binarize the segmentation masks.
+        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+            The resampling filter to use when resizing the masks.
+    """
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
+    ratio_height, ratio_width = ratios
+
+    new_annotation = {}
+    new_annotation["size"] = target_size
+
+    for key, value in annotation.items():
+        if key == "boxes":
+            boxes = value
+            scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+            new_annotation["boxes"] = scaled_boxes
+        elif key == "area":
+            area = value
+            scaled_area = area * (ratio_width * ratio_height)
+            new_annotation["area"] = scaled_area
+        elif key == "masks":
+            masks = value[:, None]
+            masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
+            masks = masks.astype(np.float32)
+            masks = masks[:, 0] > threshold
+            new_annotation["masks"] = masks
+        elif key == "size":
+            new_annotation["size"] = target_size
+        else:
+            new_annotation[key] = value
+
+    return new_annotation
+
+
+# Copied from transformers.models.detr.image_processing_detr.binary_mask_to_rle
+def binary_mask_to_rle(mask):
+    """
+    Converts given binary mask of shape `(height, width)` to the run-length encoding (RLE) format.
+
+    Args:
+        mask (`torch.Tensor` or `numpy.array`):
+            A binary mask tensor of shape `(height, width)` where 0 denotes background and 1 denotes the target
+            segment_id or class_id.
+    Returns:
+        `List`: Run-length encoded list of the binary mask. Refer to COCO API for more information about the RLE
+        format.
+    """
+    if is_torch_tensor(mask):
+        mask = mask.numpy()
+
+    pixels = mask.flatten()
+    pixels = np.concatenate([[0], pixels, [0]])
+    runs = np.where(pixels[1:] != pixels[:-1])[0] + 1
+    runs[1::2] -= runs[::2]
+    return list(runs)
+
+
+# Copied from transformers.models.detr.image_processing_detr.convert_segmentation_to_rle
+def convert_segmentation_to_rle(segmentation):
+    """
+    Converts given segmentation map of shape `(height, width)` to the run-length encoding (RLE) format.
+
+    Args:
+        segmentation (`torch.Tensor` or `numpy.array`):
+            A segmentation map of shape `(height, width)` where each value denotes a segment or class id.
+    Returns:
+        `List[List]`: A list of lists, where each list is the run-length encoding of a segment / class id.
+    """
+    segment_ids = torch.unique(segmentation)
+
+    run_length_encodings = []
+    for idx in segment_ids:
+        mask = torch.where(segmentation == idx, 1, 0)
+        rle = binary_mask_to_rle(mask)
+        run_length_encodings.append(rle)
+
+    return run_length_encodings
+
+
+# Copied from transformers.models.detr.image_processing_detr.remove_low_and_no_objects
+def remove_low_and_no_objects(masks, scores, labels, object_mask_threshold, num_labels):
+    """
+    Binarizes the given masks using `object_mask_threshold` and returns the associated values of `masks`, `scores` and
+    `labels`.
+
+    Args:
+        masks (`torch.Tensor`):
+            A tensor of shape `(num_queries, height, width)`.
+        scores (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        labels (`torch.Tensor`):
+            A tensor of shape `(num_queries)`.
+        object_mask_threshold (`float`):
+            A number between 0 and 1 used to binarize the masks.
+    Raises:
+        `ValueError`: Raised when the first dimension doesn't match in all input tensors.
+    Returns:
+        `Tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`]`: The `masks`, `scores` and `labels` without the region
+        < `object_mask_threshold`.
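+
+    For instance, with `num_labels=150` a query predicted as label 150 (the "no object" class) is dropped regardless
+    of its score.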
+ """ + if not (masks.shape[0] == scores.shape[0] == labels.shape[0]): + raise ValueError("mask, scores and labels must have the same shape!") + + to_keep = labels.ne(num_labels) & (scores > object_mask_threshold) + + return masks[to_keep], scores[to_keep], labels[to_keep] + + +# Copied from transformers.models.detr.image_processing_detr.check_segment_validity +def check_segment_validity(mask_labels, mask_probs, k, mask_threshold=0.5, overlap_mask_area_threshold=0.8): + # Get the mask associated with the k class + mask_k = mask_labels == k + mask_k_area = mask_k.sum() + + # Compute the area of all the stuff in query k + original_area = (mask_probs[k] >= mask_threshold).sum() + mask_exists = mask_k_area > 0 and original_area > 0 + + # Eliminate disconnected tiny segments + if mask_exists: + area_ratio = mask_k_area / original_area + if not area_ratio.item() > overlap_mask_area_threshold: + mask_exists = False + + return mask_exists, mask_k + + +# Copied from transformers.models.detr.image_processing_detr.compute_segments +def compute_segments( + mask_probs, + pred_scores, + pred_labels, + mask_threshold: float = 0.5, + overlap_mask_area_threshold: float = 0.8, + label_ids_to_fuse: Optional[Set[int]] = None, + target_size: Tuple[int, int] = None, +): + height = mask_probs.shape[1] if target_size is None else target_size[0] + width = mask_probs.shape[2] if target_size is None else target_size[1] + + segmentation = torch.zeros((height, width), dtype=torch.int32, device=mask_probs.device) + segments: List[Dict] = [] + + if target_size is not None: + mask_probs = nn.functional.interpolate( + mask_probs.unsqueeze(0), size=target_size, mode="bilinear", align_corners=False + )[0] + + current_segment_id = 0 + + # Weigh each mask by its prediction score + mask_probs *= pred_scores.view(-1, 1, 1) + mask_labels = mask_probs.argmax(0) # [height, width] + + # Keep track of instances of each class + stuff_memory_list: Dict[str, int] = {} + for k in range(pred_labels.shape[0]): + pred_class = pred_labels[k].item() + should_fuse = pred_class in label_ids_to_fuse + + # Check if mask exists and large enough to be a segment + mask_exists, mask_k = check_segment_validity( + mask_labels, mask_probs, k, mask_threshold, overlap_mask_area_threshold + ) + + if mask_exists: + if pred_class in stuff_memory_list: + current_segment_id = stuff_memory_list[pred_class] + else: + current_segment_id += 1 + + # Add current object segment to final segmentation map + segmentation[mask_k] = current_segment_id + segment_score = round(pred_scores[k].item(), 6) + segments.append( + { + "id": current_segment_id, + "label_id": pred_class, + "was_fused": should_fuse, + "score": segment_score, + } + ) + if should_fuse: + stuff_memory_list[pred_class] = current_segment_id + + return segmentation, segments + + +class GroundingDinoImageProcessor(BaseImageProcessor): + r""" + Constructs a Grounding DINO image processor. + + Args: + format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`): + Data format of the annotations. One of "coco_detection" or "coco_panoptic". + do_resize (`bool`, *optional*, defaults to `True`): + Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be + overridden by the `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`): + Size of the image's (height, width) dimensions after resizing. Can be overridden by the `size` parameter in + the `preprocess` method. 
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+            Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+            channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image to the largest image in a batch and create a pixel mask. Can be
+            overridden by the `do_pad` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.__init__
+    def __init__(
+        self,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+        do_resize: bool = True,
+        size: Dict[str, int] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Union[float, List[float]] = None,
+        image_std: Union[float, List[float]] = None,
+        do_convert_annotations: Optional[bool] = None,
+        do_pad: bool = True,
+        **kwargs,
+    ) -> None:
+        if "pad_and_return_pixel_mask" in kwargs:
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge']` instead.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            max_size = None if size is None else 1333
+
+        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+
+        # Backwards compatibility
+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
+        super().__init__(**kwargs)
+        self.format = format
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+        self.do_pad = do_pad
+        self._valid_processor_keys = [
+            "images",
+            "annotations",
+            "return_segmentation_masks",
+            "masks_path",
+            "do_resize",
+            "size",
+            "resample",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "do_convert_annotations",
+            "image_mean",
+            "image_std",
+            "do_pad",
+            "format",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    @classmethod
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.from_dict with Detr->GroundingDino
+    def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs):
+        """
+        Overrides the `from_dict` method from the base class to make sure parameters are updated if image processor is
+        created using from_dict and kwargs e.g. `GroundingDinoImageProcessor.from_pretrained(checkpoint, size=600,
+        max_size=800)`
+        """
+        image_processor_dict = image_processor_dict.copy()
+        if "max_size" in kwargs:
+            image_processor_dict["max_size"] = kwargs.pop("max_size")
+        if "pad_and_return_pixel_mask" in kwargs:
+            image_processor_dict["pad_and_return_pixel_mask"] = kwargs.pop("pad_and_return_pixel_mask")
+        return super().from_dict(image_processor_dict, **kwargs)
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_annotation with DETR->GroundingDino
+    def prepare_annotation(
+        self,
+        image: np.ndarray,
+        target: Dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: bool = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> Dict:
+        """
+        Prepare an annotation for feeding into GroundingDino model.
+        """
+        format = format if format is not None else self.format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
+        elif format == AnnotationFormat.COCO_PANOPTIC:
+            return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
+            )
+        else:
+            raise ValueError(f"Format {format} is not supported.")
+        return target
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare
+    def prepare(self, image, target, return_segmentation_masks=None, masks_path=None):
+        logger.warning_once(
+            "The `prepare` method is deprecated and will be removed in v4.33. "
+            "Please use `prepare_annotation` instead. Note: the `prepare_annotation` method "
+            "does not return the image anymore.",
+        )
+        target = self.prepare_annotation(image, target, return_segmentation_masks, masks_path, self.format)
+        return image, target
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.convert_coco_poly_to_mask
+    def convert_coco_poly_to_mask(self, *args, **kwargs):
+        logger.warning_once("The `convert_coco_poly_to_mask` method is deprecated and will be removed in v4.33. ")
+        return convert_coco_poly_to_mask(*args, **kwargs)
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_detection
+    def prepare_coco_detection(self, *args, **kwargs):
+        logger.warning_once("The `prepare_coco_detection` method is deprecated and will be removed in v4.33. ")
+        return prepare_coco_detection_annotation(*args, **kwargs)
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.prepare_coco_panoptic
+    def prepare_coco_panoptic(self, *args, **kwargs):
+        logger.warning_once("The `prepare_coco_panoptic` method is deprecated and will be removed in v4.33. ")
+        return prepare_coco_panoptic_annotation(*args, **kwargs)
+
+    # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+        int, smaller edge of the image will be matched to this number.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`Dict[str, int]`):
+                Dictionary containing the size to resize to. Can contain the keys `shortest_edge` and `longest_edge` or
+                `height` and `width`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        if "max_size" in kwargs:
+            logger.warning_once(
+                "The `max_size` parameter is deprecated and will be removed in v4.26. "
+                "Please specify in `size['longest_edge']` instead.",
+            )
+            max_size = kwargs.pop("max_size")
+        else:
+            max_size = None
+        size = get_size_dict(size, max_size=max_size, default_to_square=False)
+        if "shortest_edge" in size and "longest_edge" in size:
+            size = get_resize_output_image_size(
+                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+            )
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+ ) + image = resize( + image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs + ) + return image + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.resize_annotation + def resize_annotation( + self, + annotation, + orig_size, + size, + resample: PILImageResampling = PILImageResampling.NEAREST, + ) -> Dict: + """ + Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched + to this number. + """ + return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.rescale + def rescale( + self, + image: np.ndarray, + rescale_factor: float, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Rescale the image by the given factor. image = image * rescale_factor. + + Args: + image (`np.ndarray`): + Image to rescale. + rescale_factor (`float`): + The value to use for rescaling. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. If unset, is inferred from the input image. Can be + one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + """ + return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.normalize_annotation + def normalize_annotation(self, annotation: Dict, image_size: Tuple[int, int]) -> Dict: + """ + Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to + `[center_x, center_y, width, height]` format and from absolute to relative pixel values. + """ + return normalize_annotation(annotation, image_size=image_size) + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._update_annotation_for_padded_image + def _update_annotation_for_padded_image( + self, + annotation: Dict, + input_image_size: Tuple[int, int], + output_image_size: Tuple[int, int], + padding, + update_bboxes, + ) -> Dict: + """ + Update the annotation for a padded image. 
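+
+        Note: when `update_bboxes` is set, boxes are assumed to be in relative `(center_x, center_y, width, height)`
+        format, so bottom/right padding rescales them by `input_size / output_size` per axis.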
+ """ + new_annotation = {} + new_annotation["size"] = output_image_size + + for key, value in annotation.items(): + if key == "masks": + masks = value + masks = pad( + masks, + padding, + mode=PaddingMode.CONSTANT, + constant_values=0, + input_data_format=ChannelDimension.FIRST, + ) + masks = safe_squeeze(masks, 1) + new_annotation["masks"] = masks + elif key == "boxes" and update_bboxes: + boxes = value + boxes *= np.asarray( + [ + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + input_image_size[1] / output_image_size[1], + input_image_size[0] / output_image_size[0], + ] + ) + new_annotation["boxes"] = boxes + elif key == "size": + new_annotation["size"] = output_image_size + else: + new_annotation[key] = value + return new_annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + annotation: Optional[Dict[str, Any]] = None, + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + if annotation is not None: + annotation = self._update_annotation_for_padded_image( + annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes + ) + return padded_image, annotation + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + update_bboxes: bool = True, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + images (List[`np.ndarray`]): + Images to pad. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + Annotations to transform according to the padding that is applied to the images. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. 
+ data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + update_bboxes (`bool`, *optional*, defaults to `True`): + Whether to update the bounding boxes in the annotations to match the padded images. If the + bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)` + format, the bounding boxes will not be updated. + """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + annotation_list = annotations if annotations is not None else [None] * len(images) + padded_images = [] + padded_annotations = [] + for image, annotation in zip(images, annotation_list): + padded_image, padded_annotation = self._pad_image( + image, + pad_size, + annotation, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=update_bboxes, + ) + padded_images.append(padded_image) + padded_annotations.append(padded_annotation) + + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors) + + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations + ] + + return encoded_inputs + + # Copied from transformers.models.detr.image_processing_detr.DetrImageProcessor.preprocess + def preprocess( + self, + images: ImageInput, + annotations: Optional[Union[AnnotationType, List[AnnotationType]]] = None, + return_segmentation_masks: bool = None, + masks_path: Optional[Union[str, pathlib.Path]] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample=None, # PILImageResampling + do_rescale: Optional[bool] = None, + rescale_factor: Optional[Union[int, float]] = None, + do_normalize: Optional[bool] = None, + do_convert_annotations: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + format: Optional[Union[str, AnnotationFormat]] = None, + return_tensors: Optional[Union[TensorType, str]] = None, + data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> BatchFeature: + """ + Preprocess an image or a batch of images so that it can be used by the model. + + Args: + images (`ImageInput`): + Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging + from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`. + annotations (`AnnotationType` or `List[AnnotationType]`, *optional*): + List of annotations associated with the image or batch of images. If annotation is for object + detection, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "annotations" (`List[Dict]`): List of annotations for an image. Each annotation should be a + dictionary. An image can have no annotations, in which case the list should be empty. 
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys: + - "image_id" (`int`): The image id. + - "segments_info" (`List[Dict]`): List of segments for an image. Each segment should be a dictionary. + An image can have no segments, in which case the list should be empty. + - "file_name" (`str`): The file name of the image. + return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks): + Whether to return segmentation masks. + masks_path (`str` or `pathlib.Path`, *optional*): + Path to the directory containing the segmentation masks. + do_resize (`bool`, *optional*, defaults to self.do_resize): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to self.size): + Size of the image after resizing. + resample (`PILImageResampling`, *optional*, defaults to self.resample): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to self.do_rescale): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to self.rescale_factor): + Rescale factor to use when rescaling the image. + do_normalize (`bool`, *optional*, defaults to self.do_normalize): + Whether to normalize the image. + do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations): + Whether to convert the annotations to the format expected by the model. Converts the bounding + boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)` + and in relative coordinates. + image_mean (`float` or `List[float]`, *optional*, defaults to self.image_mean): + Mean to use when normalizing the image. + image_std (`float` or `List[float]`, *optional*, defaults to self.image_std): + Standard deviation to use when normalizing the image. + do_pad (`bool`, *optional*, defaults to self.do_pad): + Whether to pad the image. If `True` will pad the images in the batch to the largest image in the batch + and create a pixel mask. Padding will be applied to the bottom and right of the image with zeros. + format (`str` or `AnnotationFormat`, *optional*, defaults to self.format): + Format of the annotations. + return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors): + Type of tensors to return. If `None`, will return the list of images. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + if "pad_and_return_pixel_mask" in kwargs: + logger.warning_once( + "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, " + "use `do_pad` instead." 
+ ) + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + max_size = None + if "max_size" in kwargs: + logger.warning_once( + "The `max_size` argument is deprecated and will be removed in a future version, use" + " `size['longest_edge']` instead." + ) + size = kwargs.pop("max_size") + + do_resize = self.do_resize if do_resize is None else do_resize + size = self.size if size is None else size + size = get_size_dict(size=size, max_size=max_size, default_to_square=False) + resample = self.resample if resample is None else resample + do_rescale = self.do_rescale if do_rescale is None else do_rescale + rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor + do_normalize = self.do_normalize if do_normalize is None else do_normalize + image_mean = self.image_mean if image_mean is None else image_mean + image_std = self.image_std if image_std is None else image_std + do_convert_annotations = ( + self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations + ) + do_pad = self.do_pad if do_pad is None else do_pad + format = self.format if format is None else format + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys) + + # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated. + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if annotations is not None and isinstance(annotations, dict): + annotations = [annotations] + + if annotations is not None and len(images) != len(annotations): + raise ValueError( + f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match." + ) + + format = AnnotationFormat(format) + if annotations is not None: + validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations) + + if ( + masks_path is not None + and format == AnnotationFormat.COCO_PANOPTIC + and not isinstance(masks_path, (pathlib.Path, str)) + ): + raise ValueError( + "The path to the directory containing the mask PNG files should be provided as a" + f" `pathlib.Path` or string object, but is {type(masks_path)} instead." + ) + + # All transformations expect numpy arrays + images = [to_numpy_array(image) for image in images] + + if is_scaled_image(images[0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
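+            # Only the first image is inspected here; if a batch mixes channels-first and
+            # channels-last arrays, the remaining images would be misinterpreted.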
+ input_data_format = infer_channel_dimension_format(images[0]) + + # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image) + if annotations is not None: + prepared_images = [] + prepared_annotations = [] + for image, target in zip(images, annotations): + target = self.prepare_annotation( + image, + target, + format, + return_segmentation_masks=return_segmentation_masks, + masks_path=masks_path, + input_data_format=input_data_format, + ) + prepared_images.append(image) + prepared_annotations.append(target) + images = prepared_images + annotations = prepared_annotations + del prepared_images, prepared_annotations + + # transformations + if do_resize: + if annotations is not None: + resized_images, resized_annotations = [], [] + for image, target in zip(images, annotations): + orig_size = get_image_size(image, input_data_format) + resized_image = self.resize( + image, size=size, max_size=max_size, resample=resample, input_data_format=input_data_format + ) + resized_annotation = self.resize_annotation( + target, orig_size, get_image_size(resized_image, input_data_format) + ) + resized_images.append(resized_image) + resized_annotations.append(resized_annotation) + images = resized_images + annotations = resized_annotations + del resized_images, resized_annotations + else: + images = [ + self.resize(image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images] + + if do_normalize: + images = [ + self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images + ] + + if do_convert_annotations and annotations is not None: + annotations = [ + self.normalize_annotation(annotation, get_image_size(image, input_data_format)) + for annotation, image in zip(annotations, images) + ] + + if do_pad: + # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...} + encoded_inputs = self.pad( + images, + annotations=annotations, + return_pixel_mask=True, + data_format=data_format, + input_data_format=input_data_format, + update_bboxes=do_convert_annotations, + return_tensors=return_tensors, + ) + else: + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + if annotations is not None: + encoded_inputs["labels"] = [ + BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations + ] + + return encoded_inputs + + # Copied from transformers.models.owlvit.image_processing_owlvit.OwlViTImageProcessor.post_process_object_detection with OwlViT->GroundingDino + def post_process_object_detection( + self, outputs, threshold: float = 0.1, target_sizes: Union[TensorType, List[Tuple]] = None + ): + """ + Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y, + bottom_right_x, bottom_right_y) format. + + Args: + outputs ([`GroundingDinoObjectDetectionOutput`]): + Raw outputs of the model. + threshold (`float`, *optional*): + Score threshold to keep object detection predictions. + target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*): + Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size + `(height, width)` of each image in the batch. 
If unset, predictions will not be resized. + Returns: + `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image + in the batch as predicted by the model. + """ + # TODO: (amy) add support for other frameworks + logits, boxes = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + probs = torch.max(logits, dim=-1) + scores = torch.sigmoid(probs.values) + labels = probs.indices + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for s, l, b in zip(scores, labels, boxes): + score = s[s > threshold] + label = l[s > threshold] + box = b[s > threshold] + results.append({"scores": score, "labels": label, "boxes": box}) + + return results diff --git a/src/transformers/models/grounding_dino/modeling_grounding_dino.py b/src/transformers/models/grounding_dino/modeling_grounding_dino.py new file mode 100644 index 0000000000..7f9149de91 --- /dev/null +++ b/src/transformers/models/grounding_dino/modeling_grounding_dino.py @@ -0,0 +1,3132 @@ +# coding=utf-8 +# Copyright 2024 IDEA Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
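For reference, here is a minimal usage sketch of the `post_process_object_detection` method defined above. The checkpoint name is the one used in this diff; the `AutoProcessor` entry point, the period-separated prompt format, the score threshold, and the local image path are illustrative assumptions, not part of this patch:

```python
import torch
from PIL import Image
from transformers import AutoProcessor, GroundingDinoForObjectDetection

processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")

image = Image.open("cats.png")  # hypothetical input image
inputs = processor(images=image, text="a cat. a remote control.", return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# target_sizes expects (height, width) per image; PIL's .size is (width, height)
results = processor.image_processor.post_process_object_detection(
    outputs, threshold=0.35, target_sizes=[image.size[::-1]]
)
for score, box in zip(results[0]["scores"], results[0]["boxes"]):
    print(f"{score.item():.3f}", [round(coord, 1) for coord in box.tolist()])
```

The returned boxes are in absolute `(top_left_x, top_left_y, bottom_right_x, bottom_right_y)` pixel coordinates once `target_sizes` is provided, matching the docstring below.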
+""" PyTorch Grounding DINO model.""" + +import copy +import math +import os +import warnings +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +from torch import Tensor, nn +from torch.autograd import Function +from torch.autograd.function import once_differentiable + +from ...activations import ACT2FN +from ...file_utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_scipy_available, + is_timm_available, + is_torch_cuda_available, + is_vision_available, + replace_return_docstrings, + requires_backends, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import meshgrid +from ...utils import is_accelerate_available, is_ninja_available, logging +from ...utils.backbone_utils import load_backbone +from ..auto import AutoModel +from .configuration_grounding_dino import GroundingDinoConfig + + +if is_vision_available(): + from transformers.image_transforms import center_to_corners_format + +if is_accelerate_available(): + from accelerate import PartialState + from accelerate.utils import reduce + +if is_scipy_available(): + from scipy.optimize import linear_sum_assignment + +if is_timm_available(): + from timm import create_model + + +logger = logging.get_logger(__name__) + +MultiScaleDeformableAttention = None + + +# Copied from models.deformable_detr.load_cuda_kernels +def load_cuda_kernels(): + from torch.utils.cpp_extension import load + + global MultiScaleDeformableAttention + + root = Path(__file__).resolve().parent.parent.parent / "kernels" / "grounding_dino" + src_files = [ + root / filename + for filename in [ + "vision.cpp", + os.path.join("cpu", "ms_deform_attn_cpu.cpp"), + os.path.join("cuda", "ms_deform_attn_cuda.cu"), + ] + ] + + MultiScaleDeformableAttention = load( + "MultiScaleDeformableAttention", + src_files, + with_cuda=True, + extra_include_paths=[str(root)], + extra_cflags=["-DWITH_CUDA=1"], + extra_cuda_cflags=[ + "-DCUDA_HAS_FP16=1", + "-D__CUDA_NO_HALF_OPERATORS__", + "-D__CUDA_NO_HALF_CONVERSIONS__", + "-D__CUDA_NO_HALF2_OPERATORS__", + ], + ) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttentionFunction +class MultiScaleDeformableAttentionFunction(Function): + @staticmethod + def forward( + context, + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + im2col_step, + ): + context.im2col_step = im2col_step + output = MultiScaleDeformableAttention.ms_deform_attn_forward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + context.im2col_step, + ) + context.save_for_backward( + value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights + ) + return output + + @staticmethod + @once_differentiable + def backward(context, grad_output): + ( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + ) = context.saved_tensors + grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward( + value, + value_spatial_shapes, + value_level_start_index, + sampling_locations, + attention_weights, + grad_output, + context.im2col_step, + ) + + return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "GroundingDinoConfig" +_CHECKPOINT_FOR_DOC = 
"IDEA-Research/grounding-dino-tiny" + +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "IDEA-Research/grounding-dino-tiny", + # See all Grounding DINO models at https://huggingface.co/models?filter=grounding-dino +] + + +@dataclass +class GroundingDinoDecoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDinoDecoder. This class adds two attributes to + BaseModelOutputWithCrossAttentions, namely: + - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer) + - a stacked tensor of intermediate reference points. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`): + Stacked intermediate hidden states (output of each layer of the decoder). + intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`): + Stacked intermediate reference points (reference points of each layer of the decoder). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer + plus the initial embedding outputs. + attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads, + sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the + weighted average in the self-attention, cross-attention and multi-scale deformable attention heads. + """ + + last_hidden_state: torch.FloatTensor = None + intermediate_hidden_states: torch.FloatTensor = None + intermediate_reference_points: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + + +@dataclass +class GroundingDinoEncoderOutput(ModelOutput): + """ + Base class for outputs of the GroundingDinoEncoder. This class extends BaseModelOutput, due to: + - vision and text last hidden states + - vision and text intermediate hidden states + + Args: + last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the vision encoder. + last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the text encoder. + vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each + layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the + output of each layer plus the initial embedding outputs. 
+        text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+            each layer plus the initial embedding outputs.
+        attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
+            multi-scale deformable attention heads.
+    """
+
+    last_hidden_state_vision: torch.FloatTensor = None
+    last_hidden_state_text: torch.FloatTensor = None
+    vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+
+
+@dataclass
+class GroundingDinoModelOutput(ModelOutput):
+    """
+    Base class for outputs of the Grounding DINO encoder-decoder model.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+            plus the initial embedding outputs.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
+        encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the vision encoder of the model.
+        encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the text encoder of the model.
+        encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+            layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+            output of each layer plus the initial embedding outputs.
+        encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+            each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
+            multi-scale deformable attention heads.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
+            region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
+            background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    init_reference_points: torch.FloatTensor = None
+    intermediate_hidden_states: torch.FloatTensor = None
+    intermediate_reference_points: torch.FloatTensor = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None
+    encoder_vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class GroundingDinoObjectDetectionOutput(ModelOutput):
+    """
+    Output type of [`GroundingDinoForObjectDetection`].
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (`Dict`, *optional*):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use [`~GroundingDinoProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
+        auxiliary_outputs (`List[Dict]`, *optional*):
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+            `pred_boxes`) for each decoder layer.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+            plus the initial embedding outputs.
+        decoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            weighted average in the self-attention, cross-attention and multi-scale deformable attention heads.
+        encoder_last_hidden_state_vision (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the vision encoder of the model.
+        encoder_last_hidden_state_text (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the text encoder of the model.
+        encoder_vision_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the vision embeddings + one for the output of each
+            layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the vision encoder at the
+            output of each layer plus the initial embedding outputs.
+        encoder_text_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the text embeddings + one for the output of each layer)
+            of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the text encoder at the output of
+            each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of tuples of `torch.FloatTensor` (one for attention for each layer) of shape `(batch_size, num_heads,
+            sequence_length, sequence_length)`. Attentions weights after the attention softmax, used to compute the
+            weighted average in the text-vision attention, vision-text attention, text-enhancer (self-attention) and
+            multi-scale deformable attention heads.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.num_queries` scoring bounding boxes are picked as
+            region proposals in the first stage. Output of bounding box binary classification (i.e. foreground and
+            background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[Dict] = None
+    logits: torch.FloatTensor = None
+    pred_boxes: torch.FloatTensor = None
+    auxiliary_outputs: Optional[List[Dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    encoder_last_hidden_state_vision: Optional[torch.FloatTensor] = None
+    encoder_last_hidden_state_text: Optional[torch.FloatTensor] = None
+    encoder_vision_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_text_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->GroundingDino
+class GroundingDinoFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+ """ + + def __init__(self, n): + super().__init__() + self.register_buffer("weight", torch.ones(n)) + self.register_buffer("bias", torch.zeros(n)) + self.register_buffer("running_mean", torch.zeros(n)) + self.register_buffer("running_var", torch.ones(n)) + + def _load_from_state_dict( + self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ): + num_batches_tracked_key = prefix + "num_batches_tracked" + if num_batches_tracked_key in state_dict: + del state_dict[num_batches_tracked_key] + + super()._load_from_state_dict( + state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs + ) + + def forward(self, x): + # move reshapes to the beginning + # to make it user-friendly + weight = self.weight.reshape(1, -1, 1, 1) + bias = self.bias.reshape(1, -1, 1, 1) + running_var = self.running_var.reshape(1, -1, 1, 1) + running_mean = self.running_mean.reshape(1, -1, 1, 1) + epsilon = 1e-5 + scale = weight * (running_var + epsilon).rsqrt() + bias = bias - running_mean * scale + return x * scale + bias + + +# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->GroundingDino +def replace_batch_norm(model): + r""" + Recursively replace all `torch.nn.BatchNorm2d` with `GroundingDinoFrozenBatchNorm2d`. + + Args: + model (torch.nn.Module): + input model + """ + for name, module in model.named_children(): + if isinstance(module, nn.BatchNorm2d): + new_module = GroundingDinoFrozenBatchNorm2d(module.num_features) + + if not module.weight.device == torch.device("meta"): + new_module.weight.data.copy_(module.weight) + new_module.bias.data.copy_(module.bias) + new_module.running_mean.data.copy_(module.running_mean) + new_module.running_var.data.copy_(module.running_var) + + model._modules[name] = new_module + + if len(list(module.children())) > 0: + replace_batch_norm(module) + + +class GroundingDinoConvEncoder(nn.Module): + """ + Convolutional backbone, using either the AutoBackbone API or one from the timm library. + + nn.BatchNorm2d layers are replaced by GroundingDinoFrozenBatchNorm2d as defined above. 
+ + """ + + def __init__(self, config): + super().__init__() + + self.config = config + + if config.use_timm_backbone: + requires_backends(self, ["timm"]) + backbone = create_model( + config.backbone, + pretrained=config.use_pretrained_backbone, + features_only=True, + **config.backbone_kwargs, + ) + else: + backbone = load_backbone(config) + + # replace batch norm by frozen batch norm + with torch.no_grad(): + replace_batch_norm(backbone) + self.model = backbone + self.intermediate_channel_sizes = ( + self.model.feature_info.channels() if config.use_timm_backbone else self.model.channels + ) + + backbone_model_type = config.backbone if config.use_timm_backbone else config.backbone_config.model_type + if "resnet" in backbone_model_type: + for name, parameter in self.model.named_parameters(): + if config.use_timm_backbone: + if "layer2" not in name and "layer3" not in name and "layer4" not in name: + parameter.requires_grad_(False) + else: + if "stage.1" not in name and "stage.2" not in name and "stage.3" not in name: + parameter.requires_grad_(False) + + # Copied from transformers.models.detr.modeling_detr.DetrConvEncoder.forward with Detr->GroundingDino + def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor): + # send pixel_values through the model to get list of feature maps + features = self.model(pixel_values) if self.config.use_timm_backbone else self.model(pixel_values).feature_maps + + out = [] + for feature_map in features: + # downsample pixel_mask to match shape of corresponding feature_map + mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0] + out.append((feature_map, mask)) + return out + + +# Copied from transformers.models.detr.modeling_detr.DetrConvModel with Detr->GroundingDino +class GroundingDinoConvModel(nn.Module): + """ + This module adds 2D position embeddings to all intermediate feature maps of the convolutional encoder. + """ + + def __init__(self, conv_encoder, position_embedding): + super().__init__() + self.conv_encoder = conv_encoder + self.position_embedding = position_embedding + + def forward(self, pixel_values, pixel_mask): + # send pixel_values and pixel_mask through backbone to get list of (feature_map, pixel_mask) tuples + out = self.conv_encoder(pixel_values, pixel_mask) + pos = [] + for feature_map, mask in out: + # position encoding + pos.append(self.position_embedding(feature_map, mask).to(feature_map.dtype)) + + return out, pos + + +class GroundingDinoSinePositionEmbedding(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one used by the Attention is all you + need paper, generalized to work on images. 
+ """ + + def __init__(self, config): + super().__init__() + self.embedding_dim = config.d_model // 2 + self.temperature = config.positional_embedding_temperature + self.scale = 2 * math.pi + + def forward(self, pixel_values, pixel_mask): + y_embed = pixel_mask.cumsum(1, dtype=torch.float32) + x_embed = pixel_mask.cumsum(2, dtype=torch.float32) + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.embedding_dim, dtype=torch.float32, device=pixel_values.device) + dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + return pos + + +class GroundingDinoLearnedPositionEmbedding(nn.Module): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, config): + super().__init__() + + embedding_dim = config.d_model // 2 + self.row_embeddings = nn.Embedding(50, embedding_dim) + self.column_embeddings = nn.Embedding(50, embedding_dim) + + def forward(self, pixel_values, pixel_mask=None): + height, width = pixel_values.shape[-2:] + width_values = torch.arange(width, device=pixel_values.device) + height_values = torch.arange(height, device=pixel_values.device) + x_emb = self.column_embeddings(width_values) + y_emb = self.row_embeddings(height_values) + pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1) + pos = pos.permute(2, 0, 1) + pos = pos.unsqueeze(0) + pos = pos.repeat(pixel_values.shape[0], 1, 1, 1) + return pos + + +def build_position_encoding(config): + if config.position_embedding_type == "sine": + position_embedding = GroundingDinoSinePositionEmbedding(config) + elif config.position_embedding_type == "learned": + position_embedding = GroundingDinoLearnedPositionEmbedding(config) + else: + raise ValueError(f"Not supported {config.position_embedding_type}") + + return position_embedding + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.multi_scale_deformable_attention +def multi_scale_deformable_attention( + value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor +) -> Tensor: + batch_size, _, num_heads, hidden_dim = value.shape + _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape + value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1) + sampling_grids = 2 * sampling_locations - 1 + sampling_value_list = [] + for level_id, (height, width) in enumerate(value_spatial_shapes): + # batch_size, height*width, num_heads, hidden_dim + # -> batch_size, height*width, num_heads*hidden_dim + # -> batch_size, num_heads*hidden_dim, height*width + # -> batch_size*num_heads, hidden_dim, height, width + value_l_ = ( + value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width) + ) + # batch_size, num_queries, num_heads, num_points, 2 + # -> batch_size, num_heads, num_queries, num_points, 2 + # -> batch_size*num_heads, num_queries, num_points, 2 + sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 
2).flatten(0, 1)
+        # batch_size*num_heads, hidden_dim, num_queries, num_points
+        sampling_value_l_ = nn.functional.grid_sample(
+            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+        )
+        sampling_value_list.append(sampling_value_l_)
+    # (batch_size, num_queries, num_heads, num_levels, num_points)
+    # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+    # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
+    attention_weights = attention_weights.transpose(1, 2).reshape(
+        batch_size * num_heads, 1, num_queries, num_levels * num_points
+    )
+    output = (
+        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+        .sum(-1)
+        .view(batch_size, num_heads * hidden_dim, num_queries)
+    )
+    return output.transpose(1, 2).contiguous()
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->GroundingDino, Deformable DETR->Grounding DINO
+class GroundingDinoMultiscaleDeformableAttention(nn.Module):
+    """
+    Multiscale deformable attention as proposed in Deformable DETR.
+    """
+
+    def __init__(self, config: GroundingDinoConfig, num_heads: int, n_points: int):
+        super().__init__()
+
+        kernel_loaded = MultiScaleDeformableAttention is not None
+        if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
+            try:
+                load_cuda_kernels()
+            except Exception as e:
+                logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
+
+        if config.d_model % num_heads != 0:
+            raise ValueError(
+                f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+            )
+        dim_per_head = config.d_model // num_heads
+        # check if dim_per_head is power of 2
+        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+            warnings.warn(
+                "It is recommended to set embed_dim (d_model) in GroundingDinoMultiscaleDeformableAttention so that"
+                " the dimension of each attention head is a power of 2, which is more efficient in the authors' CUDA"
+                " implementation."
+ ) + + self.im2col_step = 64 + + self.d_model = config.d_model + self.n_levels = config.num_feature_levels + self.n_heads = num_heads + self.n_points = n_points + + self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2) + self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points) + self.value_proj = nn.Linear(config.d_model, config.d_model) + self.output_proj = nn.Linear(config.d_model, config.d_model) + + self.disable_custom_kernels = config.disable_custom_kernels + + self._reset_parameters() + + def _reset_parameters(self): + nn.init.constant_(self.sampling_offsets.weight.data, 0.0) + default_dtype = torch.get_default_dtype() + thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads) + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) + grid_init = ( + (grid_init / grid_init.abs().max(-1, keepdim=True)[0]) + .view(self.n_heads, 1, 1, 2) + .repeat(1, self.n_levels, self.n_points, 1) + ) + for i in range(self.n_points): + grid_init[:, :, i, :] *= i + 1 + with torch.no_grad(): + self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) + nn.init.constant_(self.attention_weights.weight.data, 0.0) + nn.init.constant_(self.attention_weights.bias.data, 0.0) + nn.init.xavier_uniform_(self.value_proj.weight.data) + nn.init.constant_(self.value_proj.bias.data, 0.0) + nn.init.xavier_uniform_(self.output_proj.weight.data) + nn.init.constant_(self.output_proj.bias.data, 0.0) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states=None, + encoder_attention_mask=None, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + # add position embeddings to the hidden states before projecting to queries and keys + if position_embeddings is not None: + hidden_states = self.with_pos_embed(hidden_states, position_embeddings) + + batch_size, num_queries, _ = hidden_states.shape + batch_size, sequence_length, _ = encoder_hidden_states.shape + if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length: + raise ValueError( + "Make sure to align the spatial shapes with the sequence length of the encoder hidden states" + ) + + value = self.value_proj(encoder_hidden_states) + if attention_mask is not None: + # we invert the attention_mask + value = value.masked_fill(~attention_mask[..., None], float(0)) + value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads) + sampling_offsets = self.sampling_offsets(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2 + ) + attention_weights = self.attention_weights(hidden_states).view( + batch_size, num_queries, self.n_heads, self.n_levels * self.n_points + ) + attention_weights = F.softmax(attention_weights, -1).view( + batch_size, num_queries, self.n_heads, self.n_levels, self.n_points + ) + # batch_size, num_queries, n_heads, n_levels, n_points, 2 + num_coordinates = reference_points.shape[-1] + if num_coordinates == 2: + offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1) + sampling_locations = ( + reference_points[:, :, None, :, None, :] + + sampling_offsets / 
offset_normalizer[None, None, None, :, None, :] + ) + elif num_coordinates == 4: + sampling_locations = ( + reference_points[:, :, None, :, None, :2] + + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 + ) + else: + raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}") + + if self.disable_custom_kernels: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + else: + try: + # custom kernel + output = MultiScaleDeformableAttentionFunction.apply( + value, + spatial_shapes, + level_start_index, + sampling_locations, + attention_weights, + self.im2col_step, + ) + except Exception: + # PyTorch implementation + output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights) + output = self.output_proj(output) + + return output, attention_weights + + +class GroundingDinoTextEnhancerLayer(nn.Module): + """Vanilla Transformer with text embeddings as input""" + + def __init__(self, config): + super().__init__() + self.self_attn = GroundingDinoMultiheadAttention( + config, num_attention_heads=config.encoder_attention_heads // 2 + ) + + # Implementation of Feedforward model + self.fc1 = nn.Linear(config.d_model, config.encoder_ffn_dim // 2) + self.fc2 = nn.Linear(config.encoder_ffn_dim // 2, config.d_model) + + self.layer_norm_before = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layer_norm_after = nn.LayerNorm(config.d_model, config.layer_norm_eps) + + self.activation = ACT2FN[config.activation_function] + self.num_heads = config.encoder_attention_heads // 2 + self.dropout = config.text_enhancer_dropout + + def with_pos_embed(self, hidden_state: Tensor, position_embeddings: Optional[Tensor]): + return hidden_state if position_embeddings is None else hidden_state + position_embeddings + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_masks: Optional[torch.BoolTensor] = None, + position_embeddings: Optional[torch.FloatTensor] = None, + ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + """Text self-attention to enhance projection of text features generated by + the text encoder (AutoModel based on text_config) within GroundingDinoEncoderLayer + + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`): + Text features generated by the text encoder. + attention_masks (`torch.BoolTensor`, *optional*): + Attention mask for text self-attention. False for real tokens and True for padding tokens. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings to be added to the hidden states. + + Returns: + `tuple(torch.FloatTensor)` comprising two elements: + - **hidden_states** (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`) -- + Output of the text self-attention layer. + - **attention_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, sequence_length, + sequence_length)`) -- + Attention weights of the text self-attention layer. 
+ """ + + # repeat attn mask + if attention_masks.dim() == 3 and attention_masks.shape[0] == hidden_states.shape[0]: + # batch_size, num_queries, num_keys + attention_masks = attention_masks[:, None, :, :] + attention_masks = attention_masks.repeat(1, self.num_heads, 1, 1) + + dtype = torch.float16 + attention_masks = attention_masks.to(dtype=dtype) # fp16 compatibility + attention_masks = (1.0 - attention_masks) * torch.finfo(dtype).min + + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) + attention_output, attention_weights = self.self_attn( + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=attention_masks, + output_attentions=True, + ) + attention_output = nn.functional.dropout(attention_output, p=self.dropout, training=self.training) + hidden_states = hidden_states + attention_output + hidden_states = self.layer_norm_before(hidden_states) + + residual = hidden_states + hidden_states = self.activation(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = hidden_states + residual + hidden_states = self.layer_norm_after(hidden_states) + + return hidden_states, attention_weights + + +class GroundingDinoBiMultiHeadAttention(nn.Module): + def __init__(self, config): + super().__init__() + + vision_dim = text_dim = config.d_model + embed_dim = config.encoder_ffn_dim // 2 + num_heads = config.encoder_attention_heads // 2 + dropout = config.fusion_dropout + + self.embed_dim = embed_dim + self.num_heads = num_heads + self.head_dim = embed_dim // num_heads + self.vision_dim = vision_dim + self.text_dim = text_dim + + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"`embed_dim` must be divisible by `num_heads` (got `embed_dim`: {self.embed_dim} and `num_heads`: {self.num_heads})." + ) + self.scale = self.head_dim ** (-0.5) + self.dropout = dropout + + self.vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.text_proj = nn.Linear(self.text_dim, self.embed_dim) + self.values_vision_proj = nn.Linear(self.vision_dim, self.embed_dim) + self.values_text_proj = nn.Linear(self.text_dim, self.embed_dim) + + self.out_vision_proj = nn.Linear(self.embed_dim, self.vision_dim) + self.out_text_proj = nn.Linear(self.embed_dim, self.text_dim) + + def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int): + return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + vision_attention_mask: Optional[torch.BoolTensor] = None, + text_attention_mask: Optional[torch.BoolTensor] = None, + ) -> Tuple[Tuple[torch.FloatTensor, torch.FloatTensor], Tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image-to-text and text-to-image cross-attention + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + vision_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. 
+ text_attention_mask (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. + + Returns: + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an attention + output and weights: + - **vision_attn_output** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_din)`) + -- + Output of the image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_attn_output** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`) -- + Output of the text-to-image cross-attention layer. + - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. + """ + batch_size, tgt_len, _ = vision_features.size() + + vision_query_states = self.vision_proj(vision_features) * self.scale + vision_query_states = self._reshape(vision_query_states, tgt_len, batch_size) + + text_key_states = self.text_proj(text_features) + text_key_states = self._reshape(text_key_states, -1, batch_size) + + vision_value_states = self.values_vision_proj(vision_features) + vision_value_states = self._reshape(vision_value_states, -1, batch_size) + + text_value_states = self.values_text_proj(text_features) + text_value_states = self._reshape(text_value_states, -1, batch_size) + + proj_shape = (batch_size * self.num_heads, -1, self.head_dim) + + vision_query_states = vision_query_states.view(*proj_shape) + text_key_states = text_key_states.view(*proj_shape) + vision_value_states = vision_value_states.view(*proj_shape) + text_value_states = text_value_states.view(*proj_shape) + + src_len = text_key_states.size(1) + attn_weights = torch.bmm(vision_query_states, text_key_states.transpose(1, 2)) # bs*nhead, nimg, ntxt + + if attn_weights.size() != (batch_size * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(batch_size * self.num_heads, tgt_len, src_len)}, but is {attn_weights.size()}" + ) + + attn_weights = attn_weights - attn_weights.max() + # Do not increase -50000/50000, data type half has quite limited range + attn_weights = torch.clamp(attn_weights, min=-50000, max=50000) + + attn_weights_transposed = attn_weights.transpose(1, 2) + text_attn_weights = attn_weights_transposed - torch.max(attn_weights_transposed, dim=-1, keepdim=True)[0] + + # Do not increase -50000/50000, data type half has quite limited range + text_attn_weights = torch.clamp(text_attn_weights, min=-50000, max=50000) + + # mask vision for language + if vision_attention_mask is not None: + vision_attention_mask = ( + vision_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + ) + text_attn_weights.masked_fill_(vision_attention_mask, float("-inf")) + + text_attn_weights = text_attn_weights.softmax(dim=-1) + + # mask language for vision + if text_attention_mask is not None: + text_attention_mask = text_attention_mask[:, None, None, :].repeat(1, self.num_heads, 1, 1).flatten(0, 1) + attn_weights.masked_fill_(text_attention_mask, float("-inf")) + vision_attn_weights = attn_weights.softmax(dim=-1) + + vision_attn_probs = F.dropout(vision_attn_weights, p=self.dropout, training=self.training) + text_attn_probs = 
F.dropout(text_attn_weights, p=self.dropout, training=self.training) + + vision_attn_output = torch.bmm(vision_attn_probs, text_value_states) + text_attn_output = torch.bmm(text_attn_probs, vision_value_states) + + if vision_attn_output.size() != (batch_size * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`vision_attn_output` should be of size {(batch_size, self.num_heads, tgt_len, self.head_dim)}, but is {vision_attn_output.size()}" + ) + + if text_attn_output.size() != (batch_size * self.num_heads, src_len, self.head_dim): + raise ValueError( + f"`text_attn_output` should be of size {(batch_size, self.num_heads, src_len, self.head_dim)}, but is {text_attn_output.size()}" + ) + + vision_attn_output = vision_attn_output.view(batch_size, self.num_heads, tgt_len, self.head_dim) + vision_attn_output = vision_attn_output.transpose(1, 2) + vision_attn_output = vision_attn_output.reshape(batch_size, tgt_len, self.embed_dim) + + text_attn_output = text_attn_output.view(batch_size, self.num_heads, src_len, self.head_dim) + text_attn_output = text_attn_output.transpose(1, 2) + text_attn_output = text_attn_output.reshape(batch_size, src_len, self.embed_dim) + + vision_attn_output = self.out_vision_proj(vision_attn_output) + text_attn_output = self.out_text_proj(text_attn_output) + + return (vision_attn_output, vision_attn_weights), (text_attn_output, text_attn_weights) + + +# Copied from transformers.models.beit.modeling_beit.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. 
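Editor's note: the bidirectional attention above computes a single vision-to-text logits matrix and reuses its transpose for the text-to-image direction, masking each direction with the other modality's padding before its own softmax. A minimal standalone sketch of that shared-logits trick (toy shapes, no projections or heads; `vision`, `text`, and `text_pad` are made-up inputs, not names from the patch):

```python
import torch

torch.manual_seed(0)
batch, n_img, n_txt, dim = 2, 5, 4, 8
vision = torch.randn(batch, n_img, dim)
text = torch.randn(batch, n_txt, dim)
text_pad = torch.tensor([[False, False, False, True]] * batch)  # True = padding token

# One logits matrix serves both directions.
logits = torch.bmm(vision, text.transpose(1, 2))  # (batch, n_img, n_txt)

# vision -> text: mask padded text tokens, softmax over the text axis.
vision_attn = logits.masked_fill(text_pad[:, None, :], float("-inf")).softmax(dim=-1)

# text -> vision: transpose the same logits and softmax over the vision axis
# (a vision padding mask would be applied here symmetrically).
text_attn = logits.transpose(1, 2).softmax(dim=-1)  # (batch, n_txt, n_img)

vision_out = torch.bmm(vision_attn, text)  # vision tokens gather text features
text_out = torch.bmm(text_attn, vision)    # text tokens gather vision features
```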
+ """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->GroundingDino +class GroundingDinoDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +class GroundingDinoFusionLayer(nn.Module): + def __init__(self, config): + super().__init__() + drop_path = config.fusion_droppath + + # pre layer norm + self.layer_norm_vision = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.layer_norm_text = nn.LayerNorm(config.d_model, config.layer_norm_eps) + self.attn = GroundingDinoBiMultiHeadAttention(config) + + # add layer scale for training stability + self.drop_path = GroundingDinoDropPath(drop_path) if drop_path > 0.0 else nn.Identity() + init_values = 1e-4 + self.vision_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + self.text_param = nn.Parameter(init_values * torch.ones((config.d_model)), requires_grad=True) + + def forward( + self, + vision_features: torch.FloatTensor, + text_features: torch.FloatTensor, + attention_mask_vision: Optional[torch.BoolTensor] = None, + attention_mask_text: Optional[torch.BoolTensor] = None, + ) -> Tuple[Tuple[torch.FloatTensor, torch.FloatTensor], Tuple[torch.FloatTensor, torch.FloatTensor]]: + """Image and text features fusion + + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, hidden_dim)`): + Projected flattened image features generated by the vision backbone. + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_dim)`): + Projected text features generated by the text encoder. + attention_mask_vision (`torch.BoolTensor`, **optional**): + Attention mask for image-to-text cross-attention. False for real tokens and True for padding tokens. + attention_mask_text (`torch.BoolTensor`, **optional**): + Attention mask for text-to-image cross-attention. False for real tokens and True for padding tokens. + + Returns: + `tuple(tuple(torch.FloatTensor), tuple(torch.FloatTensor))` where each inner tuple comprises an enhanced + feature and attention output and weights: + - **vision_features** (`torch.FloatTensor` of shape `(batch_size, vision_sequence_length, vision_dim)`) -- + Updated vision features with attention output from image-to-text cross-attention layer. + - **vision_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, vision_sequence_length, + vision_sequence_length)`) -- + Attention weights of the image-to-text cross-attention layer. + - **text_features** (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, text_dim)`) -- + Updated text features with attention output from text-to-image cross-attention layer. 
+ - **text_attn_weights** (`torch.FloatTensor` of shape `(batch_size, num_heads, text_sequence_length, + text_sequence_length)`) -- + Attention weights of the text-to-image cross-attention layer. + """ + vision_features = self.layer_norm_vision(vision_features) + text_features = self.layer_norm_text(text_features) + (delta_v, vision_attn), (delta_t, text_attn) = self.attn( + vision_features, + text_features, + vision_attention_mask=attention_mask_vision, + text_attention_mask=attention_mask_text, + ) + vision_features = vision_features + self.drop_path(self.vision_param * delta_v) + text_features = text_features + self.drop_path(self.text_param * delta_t) + + return (vision_features, vision_attn), (text_features, text_attn) + + +class GroundingDinoDeformableLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = GroundingDinoMultiscaleDeformableAttention( + config, num_heads=config.encoder_attention_heads, n_points=config.encoder_n_points + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + position_embeddings: torch.Tensor = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Input to the layer. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Attention mask. + position_embeddings (`torch.FloatTensor`, *optional*): + Position embeddings, to be added to `hidden_states`. + reference_points (`torch.FloatTensor`, *optional*): + Reference points. + spatial_shapes (`torch.LongTensor`, *optional*): + Spatial shapes of the backbone feature maps. + level_start_index (`torch.LongTensor`, *optional*): + Level start index. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps. 
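Editor's note: the fusion layer above applies the cross-attention deltas through a layer-scale parameter (initialized to 1e-4) and stochastic depth, i.e. `x = x + drop_path(gamma * delta)`. A self-contained sketch of that residual update, under made-up names (`LayerScaleResidual` is illustrative, not part of the patch):

```python
import torch
from torch import nn


class LayerScaleResidual(nn.Module):
    """Toy residual update x + drop_path(gamma * delta), as in the fusion layer above."""

    def __init__(self, dim: int, init_values: float = 1e-4, drop_prob: float = 0.1):
        super().__init__()
        self.gamma = nn.Parameter(init_values * torch.ones(dim))  # per-channel scale, starts tiny
        self.drop_prob = drop_prob

    def forward(self, x: torch.Tensor, delta: torch.Tensor) -> torch.Tensor:
        update = self.gamma * delta
        if self.training and self.drop_prob > 0.0:
            # stochastic depth: drop the whole update for some samples, rescale the survivors
            keep = 1.0 - self.drop_prob
            mask = torch.rand(x.shape[0], *([1] * (x.dim() - 1)), device=x.device).lt(keep)
            update = update * mask / keep
        return x + update
```

Starting `gamma` near zero makes each fusion layer close to the identity at initialization, which is the training-stability rationale stated in the code comment.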
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=hidden_states,
+ encoder_attention_mask=attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ if self.training:
+ if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+ return hidden_states, attn_weights
+
+
+# Based on https://github.com/IDEA-Research/GroundingDINO/blob/2b62f419c292ca9c518daae55512fabc3fead4a4/groundingdino/models/GroundingDINO/utils.py#L24
+def get_sine_pos_embed(
+ pos_tensor: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000, exchange_xy: bool = True
+) -> Tensor:
+ """
+ Generate sine position embeddings from a position tensor.
+
+ Args:
+ pos_tensor (torch.Tensor):
+ Tensor containing positions. Shape: [..., n].
+ num_pos_feats (`int`, *optional*, defaults to 128):
+ Number of embedding dimensions generated for each scalar in `pos_tensor`.
+ temperature (`int`, *optional*, defaults to 10000):
+ Temperature in the sine/cosine function.
+ exchange_xy (`bool`, *optional*, defaults to `True`):
+ Exchange pos x and pos y. For example, if the input tensor is [x, y], the result will be [pos(y), pos(x)].
+
+ Returns:
+ position_embeddings (torch.Tensor): shape: [..., n * num_pos_feats].
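Editor's note: to make the returned shape concrete, here is a self-contained reimplementation of the sine embedding for the common 3-D case, with a quick shape check (`sine_embed` is an illustrative stand-in, not the function from the patch):

```python
import math
import torch


def sine_embed(pos: torch.Tensor, num_pos_feats: int = 128, temperature: int = 10000) -> torch.Tensor:
    """Map each scalar in pos[..., n] to num_pos_feats interleaved sine/cosine features."""
    dim_t = torch.arange(num_pos_feats, dtype=torch.float32)
    dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
    x = pos.unsqueeze(-1) * 2 * math.pi / dim_t  # (..., n, num_pos_feats)
    out = torch.stack((x[..., 0::2].sin(), x[..., 1::2].cos()), dim=-1).flatten(-2)
    return out.flatten(-2)  # (..., n * num_pos_feats)


refs = torch.rand(2, 900, 4)   # e.g. normalized (x, y, w, h) reference points
print(sine_embed(refs).shape)  # torch.Size([2, 900, 512])
```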
+ """ + scale = 2 * math.pi + dim_t = torch.arange(num_pos_feats, dtype=torch.float32, device=pos_tensor.device) + dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats) + + def sine_func(x: torch.Tensor): + sin_x = x * scale / dim_t + sin_x = torch.stack((sin_x[..., 0::2].sin(), sin_x[..., 1::2].cos()), dim=3).flatten(2) + return sin_x + + pos_tensor = pos_tensor.split([1] * pos_tensor.shape[-1], dim=-1) + position_embeddings = [sine_func(x) for x in pos_tensor] + if exchange_xy: + position_embeddings[0], position_embeddings[1] = position_embeddings[1], position_embeddings[0] + position_embeddings = torch.cat(position_embeddings, dim=-1) + return position_embeddings + + +class GroundingDinoEncoderLayer(nn.Module): + def __init__(self, config) -> None: + super().__init__() + + self.d_model = config.d_model + + self.text_enhancer_layer = GroundingDinoTextEnhancerLayer(config) + self.fusion_layer = GroundingDinoFusionLayer(config) + self.deformable_layer = GroundingDinoDeformableLayer(config) + + def get_text_position_embeddings( + self, + text_features: Tensor, + text_position_embedding: Optional[torch.Tensor], + text_position_ids: Optional[torch.Tensor], + ) -> Tensor: + batch_size, seq_length, _ = text_features.shape + if text_position_embedding is None and text_position_ids is None: + text_position_embedding = torch.arange(seq_length, device=text_features.device) + text_position_embedding = text_position_embedding.float() + text_position_embedding = text_position_embedding.unsqueeze(0).unsqueeze(-1) + text_position_embedding = text_position_embedding.repeat(batch_size, 1, 1) + text_position_embedding = get_sine_pos_embed( + text_position_embedding, num_pos_feats=self.d_model, exchange_xy=False + ) + if text_position_ids is not None: + text_position_embedding = get_sine_pos_embed( + text_position_ids[..., None], num_pos_feats=self.d_model, exchange_xy=False + ) + + return text_position_embedding + + def forward( + self, + vision_features: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + key_padding_mask: Tensor, + reference_points: Tensor, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, + ): + text_position_embedding = self.get_text_position_embeddings( + text_features, text_position_embedding, text_position_ids + ) + + (vision_features, vision_fused_attn), (text_features, text_fused_attn) = self.fusion_layer( + vision_features=vision_features, + text_features=text_features, + attention_mask_vision=key_padding_mask, + attention_mask_text=text_attention_mask, + ) + + (text_features, text_enhanced_attn) = self.text_enhancer_layer( + hidden_states=text_features, + attention_masks=~text_self_attention_masks, # note we use ~ for mask here + position_embeddings=(text_position_embedding if text_position_embedding is not None else None), + ) + + (vision_features, vision_deformable_attn) = self.deformable_layer( + hidden_states=vision_features, + attention_mask=~key_padding_mask, + position_embeddings=vision_position_embedding, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + ) + + return ( + (vision_features, text_features), + (vision_fused_attn, text_fused_attn, text_enhanced_attn, vision_deformable_attn), + ) + + +class 
GroundingDinoMultiheadAttention(nn.Module):
+ """Equivalent implementation of nn.MultiheadAttention with `batch_first=True`."""
+
+ def __init__(self, config, num_attention_heads=None):
+ super().__init__()
+ if config.hidden_size % num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({num_attention_heads})"
+ )
+
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_size = int(config.hidden_size / num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+ self.dropout = nn.Dropout(config.attention_dropout)
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self,
+ queries: torch.Tensor,
+ keys: torch.Tensor,
+ values: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> Tuple[torch.Tensor]:
+ query_layer = self.transpose_for_scores(self.query(queries))
+ key_layer = self.transpose_for_scores(self.key(keys))
+ value_layer = self.transpose_for_scores(self.value(values))
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in the GroundingDinoModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
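Editor's note: `transpose_for_scores` is pure shape bookkeeping. A toy walk-through of the reshape and the resulting score shape (sizes are made up for illustration):

```python
import torch

batch, seq, hidden, heads = 2, 7, 256, 8
head_dim = hidden // heads

x = torch.randn(batch, seq, hidden)
# (batch, seq, hidden) -> (batch, seq, heads, head_dim) -> (batch, heads, seq, head_dim)
x = x.view(batch, seq, heads, head_dim).permute(0, 2, 1, 3)

scores = x @ x.transpose(-1, -2) / head_dim**0.5  # scaled dot product, per head
print(scores.shape)  # torch.Size([2, 8, 7, 7])
```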
+ attention_probs = self.dropout(attention_probs) + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + context_layer = self.out_proj(context_layer) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class GroundingDinoDecoderLayer(nn.Module): + def __init__(self, config: GroundingDinoConfig): + super().__init__() + self.embed_dim = config.d_model + + # self-attention + self.self_attn = GroundingDinoMultiheadAttention(config, num_attention_heads=config.decoder_attention_heads) + + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # cross-attention text + self.encoder_attn_text = GroundingDinoMultiheadAttention( + config, num_attention_heads=config.decoder_attention_heads + ) + self.encoder_attn_text_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # cross-attention + self.encoder_attn = GroundingDinoMultiscaleDeformableAttention( + config, + num_heads=config.decoder_attention_heads, + n_points=config.decoder_n_points, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + # feedforward neural networks + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim, config.layer_norm_eps) + + def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]): + return tensor if position_embeddings is None else tensor + position_embeddings + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Optional[torch.Tensor] = None, + reference_points=None, + spatial_shapes=None, + level_start_index=None, + vision_encoder_hidden_states: Optional[torch.Tensor] = None, + vision_encoder_attention_mask: Optional[torch.Tensor] = None, + text_encoder_hidden_states: Optional[torch.Tensor] = None, + text_encoder_attention_mask: Optional[torch.Tensor] = None, + self_attn_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ): + residual = hidden_states + + # Self Attention + queries = keys = self.with_pos_embed(hidden_states, position_embeddings) + hidden_states, self_attn_weights = self.self_attn( + queries=queries, + keys=keys, + values=hidden_states, + attention_mask=self_attn_mask, + output_attentions=True, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + second_residual = hidden_states + + # Cross-Attention Text + queries = self.with_pos_embed(hidden_states, position_embeddings) + + hidden_states, text_cross_attn_weights = self.encoder_attn_text( + queries=queries, + keys=text_encoder_hidden_states, + values=text_encoder_hidden_states, + # attention_mask=text_encoder_attention_mask, # TODO fix cross-attention mask here + output_attentions=True, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = second_residual + hidden_states + hidden_states = self.encoder_attn_text_layer_norm(hidden_states) 
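Editor's note: the decoder layer being assembled here runs four residual sub-blocks in a fixed order, each followed by its own layer norm. A schematic sketch of that ordering (the callables and names are placeholders, not the actual module signatures):

```python
def decoder_layer_schematic(h, self_attn, text_attn, deform_attn, ffn, norms):
    """Ordering of the four residual sub-blocks in one Grounding DINO decoder layer."""
    h = norms[0](h + self_attn(h))    # 1. self-attention over the object queries
    h = norms[1](h + text_attn(h))    # 2. cross-attention into the text encoder output
    h = norms[2](h + deform_attn(h))  # 3. deformable cross-attention into the vision features
    return norms[3](h + ffn(h))       # 4. two-layer feed-forward block
```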
+ + third_residual = hidden_states + + # Cross-Attention + cross_attn_weights = None + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + attention_mask=vision_encoder_attention_mask, + encoder_hidden_states=vision_encoder_hidden_states, + encoder_attention_mask=vision_encoder_attention_mask, + position_embeddings=position_embeddings, + reference_points=reference_points, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + output_attentions=output_attentions, + ) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = third_residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, text_cross_attn_weights, cross_attn_weights) + + return outputs + + +class GroundingDinoContrastiveEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.max_text_len = config.max_text_len + + def forward( + self, + vision_hidden_state: torch.FloatTensor, + text_hidden_state: torch.FloatTensor, + text_token_mask: torch.BoolTensor, + ) -> torch.FloatTensor: + output = vision_hidden_state @ text_hidden_state.transpose(-1, -2) + output = output.masked_fill(~text_token_mask[:, None, :], float("-inf")) + + # padding to max_text_len + new_output = torch.full((*output.shape[:-1], self.max_text_len), float("-inf"), device=output.device) + new_output[..., : output.shape[-1]] = output + + return new_output + + +class GroundingDinoPreTrainedModel(PreTrainedModel): + config_class = GroundingDinoConfig + base_model_prefix = "model" + main_input_name = "pixel_values" + + def _init_weights(self, module): + std = self.config.init_std + + if isinstance(module, GroundingDinoLearnedPositionEmbedding): + nn.init.uniform_(module.row_embeddings.weight) + nn.init.uniform_(module.column_embeddings.weight) + elif isinstance(module, GroundingDinoMultiscaleDeformableAttention): + module._reset_parameters() + elif isinstance(module, GroundingDinoBiMultiHeadAttention): + nn.init.xavier_uniform_(module.vision_proj.weight) + module.vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.text_proj.weight) + module.text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_vision_proj.weight) + module.values_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.values_text_proj.weight) + module.values_text_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_vision_proj.weight) + module.out_vision_proj.bias.data.fill_(0) + nn.init.xavier_uniform_(module.out_text_proj.weight) + module.out_text_proj.bias.data.fill_(0) + elif isinstance(module, (GroundingDinoEncoderLayer, GroundingDinoDecoderLayer)): + for p in module.parameters(): + if p.dim() > 1: + nn.init.normal_(p, mean=0.0, std=std) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + 
module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, GroundingDinoMLPPredictionHead): + nn.init.constant_(module.layers[-1].weight.data, 0) + nn.init.constant_(module.layers[-1].bias.data, 0) + + if hasattr(module, "reference_points") and not self.config.two_stage: + nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0) + nn.init.constant_(module.reference_points.bias.data, 0.0) + if hasattr(module, "level_embed"): + nn.init.normal_(module.level_embed) + + def _set_gradient_checkpointing(self, module, value=False): + if isinstance(module, GroundingDinoDecoder): + module.gradient_checkpointing = value + + +GROUNDING_DINO_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`GroundingDinoConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +GROUNDING_DINO_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. + + Pixel values can be obtained using [`AutoImageProcessor`]. See [`GroundingDinoImageProcessor.__call__`] for + details. + + input_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`GroundingDinoTokenizer.__call__`] for details. + + token_type_ids (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: 0 corresponds to a `sentence A` token, 1 corresponds to a `sentence B` token + + [What are token type IDs?](../glossary#token-type-ids) + + attention_mask (`torch.LongTensor` of shape `(batch_size, text_sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are real (i.e. **not masked**), + - 0 for tokens that are padding (i.e. **masked**). + + [What are attention masks?](../glossary#attention-mask) + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). 
+ + [What are attention masks?](../glossary#attention-mask) + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state_vision`, *optional*: `last_hidden_state_text`, *optional*: + `vision_hidden_states`, *optional*: `text_hidden_states`, *optional*: `attentions`) + `last_hidden_state_vision` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence + of hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the + decoder. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +class GroundingDinoEncoder(GroundingDinoPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a + [`GroundingDinoEncoderLayer`]. + + The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers. + + Args: + config: GroundingDinoConfig + """ + + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layers = nn.ModuleList([GroundingDinoEncoderLayer(config) for _ in range(config.encoder_layers)]) + + # Initialize weights and apply final processing + self.post_init() + + @staticmethod + def get_reference_points(spatial_shapes, valid_ratios, device): + """ + Get reference points for each feature map. + + Args: + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Valid ratios of each feature map. + device (`torch.device`): + Device on which to create the tensors. + Returns: + `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)` + """ + reference_points_list = [] + for level, (height, width) in enumerate(spatial_shapes): + ref_y, ref_x = meshgrid( + torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device), + torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device), + indexing="ij", + ) + # TODO: valid_ratios could be useless here. 
check https://github.com/fundamentalvision/Deformable-DETR/issues/36 + ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height) + ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width) + ref = torch.stack((ref_x, ref_y), -1) + reference_points_list.append(ref) + reference_points = torch.cat(reference_points_list, 1) + reference_points = reference_points[:, :, None] * valid_ratios[:, None] + return reference_points + + def forward( + self, + vision_features: Tensor, + vision_attention_mask: Tensor, + vision_position_embedding: Tensor, + spatial_shapes: Tensor, + level_start_index: Tensor, + valid_ratios=None, + text_features: Optional[Tensor] = None, + text_attention_mask: Optional[Tensor] = None, + text_position_embedding: Optional[Tensor] = None, + text_self_attention_masks: Optional[Tensor] = None, + text_position_ids: Optional[Tensor] = None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + vision_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Flattened feature map (output of the backbone + projection layer) that is passed to the encoder. + vision_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`: + - 0 for pixel features that are real (i.e. **not masked**), + - 1 for pixel features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + vision_position_embedding (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`): + Spatial shapes of each feature map. + level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`): + Starting index of each feature map. + valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`): + Ratio of valid area in each feature level. + text_features (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`): + Flattened text features that are passed to the encoder. + text_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*): + Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`: + - 0 for text features that are real (i.e. **not masked**), + - 1 for text features that are padding (i.e. **masked**). + [What are attention masks?](../glossary#attention-mask) + text_position_embedding (`torch.FloatTensor` of shape `(batch_size, text_seq_len)`): + Position embeddings that are added to the queries and keys in each self-attention layer. + text_self_attention_masks (`torch.BoolTensor` of shape `(batch_size, text_seq_len, text_seq_len)`): + Masks to avoid performing attention between padding text features. Mask values selected in `[0, 1]`: + - 1 for text features that are real (i.e. **not masked**), + - 0 for text features that are padding (i.e. **masked**). + text_position_ids (`torch.LongTensor` of shape `(batch_size, num_queries)`): + Position ids for text features. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. 
See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=vision_features.device) + + encoder_vision_states = () if output_hidden_states else None + encoder_text_states = () if output_hidden_states else None + all_attns = () if output_attentions else None + all_attn_fused_text = () if output_attentions else None + all_attn_fused_vision = () if output_attentions else None + all_attn_enhanced_text = () if output_attentions else None + all_attn_deformable = () if output_attentions else None + for i, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + + (vision_features, text_features), attentions = encoder_layer( + vision_features=vision_features, + vision_position_embedding=vision_position_embedding, + spatial_shapes=spatial_shapes, + level_start_index=level_start_index, + key_padding_mask=vision_attention_mask, + reference_points=reference_points, + text_features=text_features, + text_attention_mask=text_attention_mask, + text_position_embedding=text_position_embedding, + text_self_attention_masks=text_self_attention_masks, + text_position_ids=text_position_ids, + ) + + if output_attentions: + all_attn_fused_vision += (attentions[0],) + all_attn_fused_text += (attentions[1],) + all_attn_enhanced_text += (attentions[2],) + all_attn_deformable += (attentions[3],) + + if output_hidden_states: + encoder_vision_states += (vision_features,) + encoder_text_states += (text_features,) + + if output_attentions: + all_attns = (all_attn_fused_vision, all_attn_fused_text, all_attn_enhanced_text, all_attn_deformable) + + if not return_dict: + enc_outputs = [vision_features, text_features, encoder_vision_states, encoder_text_states, all_attns] + return tuple(v for v in enc_outputs if v is not None) + return GroundingDinoEncoderOutput( + last_hidden_state_vision=vision_features, + last_hidden_state_text=text_features, + vision_hidden_states=encoder_vision_states, + text_hidden_states=encoder_text_states, + attentions=all_attns, + ) + + +class GroundingDinoDecoder(GroundingDinoPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`GroundingDinoDecoderLayer`]. + + The decoder updates the query embeddings through multiple self-attention and cross-attention layers. + + Some tweaks for Grounding DINO: + + - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass. + - it also returns a stack of intermediate outputs and reference points from all decoding layers. 
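Editor's note: `get_reference_points` above turns every spatial location of every feature level into a normalized (x, y) center. A minimal sketch for a single level with `valid_ratios == 1` (shapes chosen for illustration):

```python
import torch

height, width = 4, 6
ref_y, ref_x = torch.meshgrid(
    torch.linspace(0.5, height - 0.5, height),
    torch.linspace(0.5, width - 0.5, width),
    indexing="ij",
)
# Normalize pixel centers to [0, 1]; with valid_ratios == 1 this matches the encoder's reference points.
ref = torch.stack((ref_x.reshape(-1) / width, ref_y.reshape(-1) / height), dim=-1)
print(ref.shape)  # torch.Size([24, 2]); one (x, y) center per spatial location
```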
+
+ Args:
+ config: GroundingDinoConfig
+ """
+
+ def __init__(self, config: GroundingDinoConfig):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.layer_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps)
+ self.layers = nn.ModuleList([GroundingDinoDecoderLayer(config) for _ in range(config.decoder_layers)])
+ self.reference_points_head = GroundingDinoMLPPredictionHead(
+ config.query_dim // 2 * config.d_model, config.d_model, config.d_model, 2
+ )
+ self.gradient_checkpointing = False
+
+ # hack implementation for iterative bounding box refinement as in two-stage Deformable DETR
+ self.bbox_embed = None
+ self.class_embed = None
+ self.query_scale = None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ inputs_embeds,
+ vision_encoder_hidden_states,
+ vision_encoder_attention_mask=None,
+ text_encoder_hidden_states=None,
+ text_encoder_attention_mask=None,
+ reference_points=None,
+ spatial_shapes=None,
+ level_start_index=None,
+ valid_ratios=None,
+ self_attn_mask=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ The query embeddings that are passed into the decoder.
+ vision_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Last hidden state from encoder related to vision feature map.
+ vision_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+ - 1 for pixel features that are real (i.e. **not masked**),
+ - 0 for pixel features that are padding (i.e. **masked**).
+ text_encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, text_seq_len, hidden_size)`):
+ Last hidden state from encoder related to text features.
+ text_encoder_attention_mask (`torch.Tensor` of shape `(batch_size, text_seq_len)`, *optional*):
+ Mask to avoid performing attention on padding text features. Mask values selected in `[0, 1]`:
+ - 0 for text features that are real (i.e. **not masked**),
+ - 1 for text features that are padding (i.e. **masked**).
+ reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+ Reference point in range `[0, 1]`, top-left (0, 0), bottom-right (1, 1), including padding area.
+ spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of the feature maps.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+ Ratio of valid area in each feature level.
+ self_attn_mask (`torch.BoolTensor` of shape `(batch_size, text_seq_len)`):
+ Masks to avoid performing self-attention between the vision hidden states. Mask values selected in `[0, 1]`:
+ - 1 for queries that are real (i.e. **not masked**),
+ - 0 for queries that are padding (i.e. **masked**).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers.
See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_attns = () if output_attentions else None
+ all_cross_attns_vision = () if (output_attentions and vision_encoder_hidden_states is not None) else None
+ all_cross_attns_text = () if (output_attentions and text_encoder_hidden_states is not None) else None
+ intermediate = ()
+ intermediate_reference_points = ()
+
+ for idx, decoder_layer in enumerate(self.layers):
+ num_coordinates = reference_points.shape[-1]
+ if num_coordinates == 4:
+ reference_points_input = (
+ reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+ )
+ elif num_coordinates == 2:
+ reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+ else:
+ raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+ query_pos = get_sine_pos_embed(reference_points_input[:, :, 0, :], num_pos_feats=self.config.d_model // 2)
+ query_pos = self.reference_points_head(query_pos)
+
+ # In the original implementation they apply layer norm before outputting intermediate hidden states.
+ # That is not true between layers, though: each layer takes as input the output of the previous layer
+ # without layer norm
+ if output_hidden_states:
+ all_hidden_states += (self.layer_norm(hidden_states),)
+
+ if self.gradient_checkpointing and self.training:
+
+ def create_custom_forward(module):
+ def custom_forward(*inputs):
+ return module(*inputs, output_attentions)
+
+ return custom_forward
+
+ layer_outputs = torch.utils.checkpoint.checkpoint(
+ create_custom_forward(decoder_layer),
+ hidden_states,
+ query_pos,
+ reference_points_input,
+ spatial_shapes,
+ level_start_index,
+ vision_encoder_hidden_states,
+ vision_encoder_attention_mask,
+ text_encoder_hidden_states,
+ text_encoder_attention_mask,
+ self_attn_mask,
+ None,
+ )
+ else:
+ layer_outputs = decoder_layer(
+ hidden_states=hidden_states,
+ position_embeddings=query_pos,
+ reference_points=reference_points_input,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ vision_encoder_hidden_states=vision_encoder_hidden_states,
+ vision_encoder_attention_mask=vision_encoder_attention_mask,
+ text_encoder_hidden_states=text_encoder_hidden_states,
+ text_encoder_attention_mask=text_encoder_attention_mask,
+ self_attn_mask=self_attn_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # hack implementation for iterative bounding box refinement
+ if self.bbox_embed is not None:
+ tmp = self.bbox_embed[idx](hidden_states)
+ num_coordinates = reference_points.shape[-1]
+ if num_coordinates == 4:
+ new_reference_points = tmp + torch.special.logit(reference_points, eps=1e-5)
+ new_reference_points = new_reference_points.sigmoid()
+ elif num_coordinates == 2:
+ new_reference_points = tmp
+ new_reference_points[..., :2] = tmp[..., :2] +
torch.special.logit(reference_points, eps=1e-5) + new_reference_points = new_reference_points.sigmoid() + else: + raise ValueError( + f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}" + ) + reference_points = new_reference_points.detach() + + intermediate += (self.layer_norm(hidden_states),) + intermediate_reference_points += (reference_points,) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if text_encoder_hidden_states is not None: + all_cross_attns_text += (layer_outputs[2],) + + if vision_encoder_hidden_states is not None: + all_cross_attns_vision += (layer_outputs[3],) + + # Keep batch_size as first dimension + intermediate = torch.stack(intermediate, dim=1) + intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1) + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if output_attentions: + all_attns += (all_self_attns, all_cross_attns_text, all_cross_attns_vision) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + intermediate, + intermediate_reference_points, + all_hidden_states, + all_attns, + ] + if v is not None + ) + return GroundingDinoDecoderOutput( + last_hidden_state=hidden_states, + intermediate_hidden_states=intermediate, + intermediate_reference_points=intermediate_reference_points, + hidden_states=all_hidden_states, + attentions=all_attns, + ) + + +# these correspond to [CLS], [SEP], . and ? +SPECIAL_TOKENS = [101, 102, 1012, 1029] + + +def generate_masks_with_special_tokens_and_transfer_map(input_ids: torch.LongTensor) -> Tuple[Tensor, Tensor]: + """Generate attention mask between each pair of special tokens and positional ids. + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. + Returns: + `tuple(torch.Tensor)` comprising attention mask between each special tokens and position_ids: + - **attention_mask** (`torch.BoolTensor` of shape `(batch_size, sequence_length, sequence_length)`) + - **position_ids** (`torch.LongTensor` of shape `(batch_size, sequence_length)`) + """ + batch_size, num_token = input_ids.shape + # special_tokens_mask: batch_size, num_token. 1 for special tokens. 
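Editor's note: the effect of `generate_masks_with_special_tokens_and_transfer_map` is a block-diagonal self-attention mask, where each phrase delimited by a special token attends only within itself and position ids restart at 0 inside every phrase. A hand-built toy illustration of the intended pattern (it mirrors the loop above rather than calling the function; token strings stand in for the ids in `SPECIAL_TOKENS`):

```python
import torch

# "[CLS] blue cat . red dog . [SEP]" -> positions restart after each delimiter
tokens = ["[CLS]", "blue", "cat", ".", "red", "dog", ".", "[SEP]"]
is_special = torch.tensor([t in ("[CLS]", ".", "[SEP]") for t in tokens])

n = len(tokens)
mask = torch.eye(n, dtype=torch.bool)  # every token may attend to itself
position_ids = torch.zeros(n, dtype=torch.long)
prev = 0
for col in is_special.nonzero().flatten().tolist():
    if col not in (0, n - 1):
        # phrase block, including its trailing delimiter, attends within itself
        mask[prev + 1 : col + 1, prev + 1 : col + 1] = True
        position_ids[prev + 1 : col + 1] = torch.arange(col - prev)
    prev = col

print(position_ids.tolist())  # [0, 0, 1, 2, 0, 1, 2, 0]
```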
0 for normal tokens + special_tokens_mask = torch.zeros((batch_size, num_token), device=input_ids.device).bool() + for special_token in SPECIAL_TOKENS: + special_tokens_mask |= input_ids == special_token + + # idxs: each row is a list of indices of special tokens + idxs = torch.nonzero(special_tokens_mask) + + # generate attention mask and positional ids + attention_mask = torch.eye(num_token, device=input_ids.device).bool().unsqueeze(0).repeat(batch_size, 1, 1) + position_ids = torch.zeros((batch_size, num_token), device=input_ids.device) + previous_col = 0 + for i in range(idxs.shape[0]): + row, col = idxs[i] + if (col == 0) or (col == num_token - 1): + attention_mask[row, col, col] = True + position_ids[row, col] = 0 + else: + attention_mask[row, previous_col + 1 : col + 1, previous_col + 1 : col + 1] = True + position_ids[row, previous_col + 1 : col + 1] = torch.arange( + 0, col - previous_col, device=input_ids.device + ) + + previous_col = col + + return attention_mask, position_ids.to(torch.long) + + +@add_start_docstrings( + """ + The bare Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) outputting raw + hidden-states without any specific head on top. + """, + GROUNDING_DINO_START_DOCSTRING, +) +class GroundingDinoModel(GroundingDinoPreTrainedModel): + def __init__(self, config: GroundingDinoConfig): + super().__init__(config) + + # Create backbone + positional encoding + backbone = GroundingDinoConvEncoder(config) + position_embeddings = build_position_encoding(config) + self.backbone = GroundingDinoConvModel(backbone, position_embeddings) + + # Create input projection layers + if config.num_feature_levels > 1: + num_backbone_outs = len(backbone.intermediate_channel_sizes) + input_proj_list = [] + for i in range(num_backbone_outs): + in_channels = backbone.intermediate_channel_sizes[i] + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ) + for _ in range(config.num_feature_levels - num_backbone_outs): + input_proj_list.append( + nn.Sequential( + nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1), + nn.GroupNorm(32, config.d_model), + ) + ) + in_channels = config.d_model + self.input_proj_vision = nn.ModuleList(input_proj_list) + else: + self.input_proj_vision = nn.ModuleList( + [ + nn.Sequential( + nn.Conv2d(backbone.intermediate_channel_sizes[-1], config.d_model, kernel_size=1), + nn.GroupNorm(32, config.d_model), + ) + ] + ) + + # Create text backbone + self.text_backbone = AutoModel.from_config(config.text_config, add_pooling_layer=False) + self.text_projection = nn.Linear(config.text_config.hidden_size, config.d_model) + + if config.embedding_init_target or not config.two_stage: + self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model) + + self.encoder = GroundingDinoEncoder(config) + self.decoder = GroundingDinoDecoder(config) + + self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model)) + + if config.two_stage: + self.enc_output = nn.Linear(config.d_model, config.d_model) + self.enc_output_norm = nn.LayerNorm(config.d_model, config.layer_norm_eps) + if ( + config.two_stage_bbox_embed_share + and config.decoder_bbox_embed_share + and self.decoder.bbox_embed is not None + ): + self.encoder_output_bbox_embed = self.decoder.bbox_embed + else: + self.encoder_output_bbox_embed = GroundingDinoMLPPredictionHead( + input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, 
num_layers=3
+ )
+
+ self.encoder_output_class_embed = GroundingDinoContrastiveEmbedding(config)
+ else:
+ self.reference_points = nn.Embedding(config.num_queries, 4)
+
+ self.post_init()
+
+ def get_encoder(self):
+ return self.encoder
+
+ def get_decoder(self):
+ return self.decoder
+
+ def freeze_backbone(self):
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
+ param.requires_grad_(False)
+
+ def unfreeze_backbone(self):
+ for name, param in self.backbone.conv_encoder.model.named_parameters():
+ param.requires_grad_(True)
+
+ def get_valid_ratio(self, mask):
+ """Get the valid ratio of all feature maps."""
+
+ _, height, width = mask.shape
+ valid_height = torch.sum(mask[:, :, 0], 1)
+ valid_width = torch.sum(mask[:, 0, :], 1)
+ valid_ratio_height = valid_height.float() / height
+ valid_ratio_width = valid_width.float() / width
+ valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
+ return valid_ratio
+
+ def generate_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+ """Generate the encoder output proposals from the encoded output.
+
+ Args:
+ enc_output (`torch.Tensor[batch_size, sequence_length, hidden_size]`): Output of the encoder.
+ padding_mask (`torch.Tensor[batch_size, sequence_length]`): Padding mask for `enc_output`.
+ spatial_shapes (`torch.Tensor[num_feature_levels, 2]`): Spatial shapes of the feature maps.
+
+ Returns:
+ `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
+ - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
+ directly predict a bounding box (without the need of a decoder).
+ - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
+ sigmoid.
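Editor's note: the proposals are stored as inverse-sigmoid logits so that later bbox-head refinements can be added in logit space and mapped back through a sigmoid. A minimal check of that round trip (values are arbitrary):

```python
import torch

p = torch.tensor([0.10, 0.50, 0.90])
logits = torch.log(p / (1 - p))  # inverse sigmoid, as used for output_proposals
print(torch.allclose(logits.sigmoid(), p))  # True: sigmoid undoes the transform

# a delta predicted by a bbox head shifts the proposal in logit space:
refined = (logits + 0.5).sigmoid()
```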
+ """ + batch_size = enc_output.shape[0] + proposals = [] + current_position = 0 + for level, (height, width) in enumerate(spatial_shapes): + mask_flatten_ = padding_mask[:, current_position : (current_position + height * width)] + mask_flatten_ = mask_flatten_.view(batch_size, height, width, 1) + valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1) + valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1) + + grid_y, grid_x = meshgrid( + torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device), + torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device), + indexing="ij", + ) + grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1) + + scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2) + grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale + width_heigth = torch.ones_like(grid) * 0.05 * (2.0**level) + proposal = torch.cat((grid, width_heigth), -1).view(batch_size, -1, 4) + proposals.append(proposal) + current_position += height * width + + output_proposals = torch.cat(proposals, 1) + output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True) + output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid + output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf")) + output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf")) + + # assign each pixel as an object query + object_query = enc_output + object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0)) + object_query = object_query.masked_fill(~output_proposals_valid, float(0)) + object_query = self.enc_output_norm(self.enc_output(object_query)) + return object_query, output_proposals + + @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=GroundingDinoModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Tensor, + input_ids: Tensor, + token_type_ids: Optional[Tensor] = None, + attention_mask: Optional[Tensor] = None, + pixel_mask: Optional[Tensor] = None, + encoder_outputs=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, AutoModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "a cat." 
+
+        >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
+        >>> model = AutoModel.from_pretrained("IDEA-Research/grounding-dino-tiny")
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 900, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_self_attention_masks, position_ids = generate_masks_with_special_tokens_and_transfer_map(input_ids)
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        text_token_mask = attention_mask.bool()  # just to avoid renaming everywhere
+
+        max_text_len = self.config.max_text_len
+        if text_self_attention_masks.shape[1] > max_text_len:
+            text_self_attention_masks = text_self_attention_masks[:, :max_text_len, :max_text_len]
+            position_ids = position_ids[:, :max_text_len]
+            input_ids = input_ids[:, :max_text_len]
+            token_type_ids = token_type_ids[:, :max_text_len]
+            text_token_mask = text_token_mask[:, :max_text_len]
+
+        # Extract text features from text backbone
+        text_outputs = self.text_backbone(
+            input_ids, text_self_attention_masks, token_type_ids, position_ids, return_dict=return_dict
+        )
+        text_features = text_outputs.last_hidden_state if return_dict else text_outputs[0]
+        text_features = self.text_projection(text_features)
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones((batch_size, height, width), dtype=torch.long, device=device)
+
+        # Extract multi-scale feature maps, all projected to `config.d_model` channels (cf Figure 4 in paper)
+        # First, send pixel_values + pixel_mask through Backbone to obtain the features
+        # which is a list of tuples
+        vision_features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+        # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        feature_maps = []
+        masks = []
+        for level, (source, mask) in enumerate(vision_features):
+            feature_maps.append(self.input_proj_vision[level](source))
+            masks.append(mask)
+
+        # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+        if self.config.num_feature_levels > len(feature_maps):
+            _len_sources = len(feature_maps)
+            for level in range(_len_sources, self.config.num_feature_levels):
+                if level == _len_sources:
+                    source = self.input_proj_vision[level](vision_features[-1][0])
+                else:
+                    source = self.input_proj_vision[level](feature_maps[-1])
+                mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0]
+                pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
+                feature_maps.append(source)
+                masks.append(mask)
+                position_embeddings_list.append(pos_l)
+
+        # Create queries
+        query_embeds = None
+        if self.config.embedding_init_target or self.config.two_stage:
+            query_embeds = self.query_position_embeddings.weight
+
+        # Prepare encoder inputs (by flattening)
+        source_flatten = []
+        mask_flatten = []
+        lvl_pos_embed_flatten = []
+        spatial_shapes = []
+        for level, (source, mask, pos_embed) in enumerate(zip(feature_maps, masks, position_embeddings_list)):
+            batch_size, num_channels, height, width = source.shape
+            spatial_shape = (height, width)
+            spatial_shapes.append(spatial_shape)
+            source = source.flatten(2).transpose(1, 2)
+            mask = mask.flatten(1)
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+            source_flatten.append(source)
+            mask_flatten.append(mask)
+        source_flatten = torch.cat(source_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        valid_ratios = valid_ratios.float()
+
+        # Fourth, send source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder
+        # Also provide spatial_shapes, level_start_index and valid_ratios
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                vision_features=source_flatten,
+                vision_attention_mask=~mask_flatten,
+                vision_position_embedding=lvl_pos_embed_flatten,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                text_features=text_features,
+                text_attention_mask=~text_token_mask,
+                text_position_embedding=None,
+                text_self_attention_masks=~text_self_attention_masks,
+                text_position_ids=position_ids,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a GroundingDinoEncoderOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, GroundingDinoEncoderOutput):
+            encoder_outputs = GroundingDinoEncoderOutput(
+                last_hidden_state_vision=encoder_outputs[0],
+                last_hidden_state_text=encoder_outputs[1],
+                vision_hidden_states=encoder_outputs[2] if output_hidden_states else None,
+                text_hidden_states=encoder_outputs[3] if output_hidden_states else None,
+                attentions=encoder_outputs[-1] if output_attentions else None,
+            )
+
+        # Fifth, prepare decoder inputs
+        enc_outputs_class = None
+        enc_outputs_coord_logits = None
+        if self.config.two_stage:
+            object_query_embedding, output_proposals = self.generate_encoder_output_proposals(
+                encoder_outputs[0], ~mask_flatten, spatial_shapes
+            )
+
+            # hack implementation as in two-stage Deformable DETR
+            # apply a detection head to each pixel (A.4 in paper)
+            # linear projection for bounding box binary classification (i.e. foreground and background)
+            enc_outputs_class = self.encoder_output_class_embed(
+                object_query_embedding, encoder_outputs[1], text_token_mask
+            )
+            # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
+            delta_bbox = self.encoder_output_bbox_embed(object_query_embedding)
+            enc_outputs_coord_logits = delta_bbox + output_proposals
+
+            # only keep top scoring `config.num_queries` proposals
+            topk = self.config.num_queries
+            topk_logits = enc_outputs_class.max(-1)[0]
+            topk_proposals = torch.topk(topk_logits, topk, dim=1)[1]
+            topk_coords_logits = torch.gather(
+                enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            )
+
+            topk_coords_logits = topk_coords_logits.detach()
+            reference_points = topk_coords_logits.sigmoid()
+            init_reference_points = reference_points
+            if query_embeds is not None:
+                target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
+            else:
+                target = torch.gather(
+                    object_query_embedding, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, self.d_model)
+                ).detach()
+        else:
+            target = query_embeds.unsqueeze(0).repeat(batch_size, 1, 1)
+            reference_points = self.reference_points.weight.unsqueeze(0).repeat(batch_size, 1, 1).sigmoid()
+            init_reference_points = reference_points
+
+        decoder_outputs = self.decoder(
+            inputs_embeds=target,
+            vision_encoder_hidden_states=encoder_outputs[0],
+            vision_encoder_attention_mask=mask_flatten,
+            text_encoder_hidden_states=encoder_outputs[1],
+            text_encoder_attention_mask=~text_token_mask,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            self_attn_mask=None,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None)
+            tuple_outputs = (
+                (decoder_outputs[0], init_reference_points) + decoder_outputs[1:] + encoder_outputs + enc_outputs
+            )
+
+            return tuple_outputs
+
+        return GroundingDinoModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            init_reference_points=init_reference_points,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            encoder_last_hidden_state_vision=encoder_outputs.last_hidden_state_vision,
+            encoder_last_hidden_state_text=encoder_outputs.last_hidden_state_text,
+            encoder_vision_hidden_states=encoder_outputs.vision_hidden_states,
+            encoder_text_hidden_states=encoder_outputs.text_hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord_logits=enc_outputs_coord_logits,
+        )
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrMLPPredictionHead
+class GroundingDinoMLPPredictionHead(nn.Module):
+    """
+    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+    height and width of a bounding box w.r.t. an image.
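+
+    For example, with `num_layers=3` the head is `Linear(input_dim, hidden_dim) -> ReLU ->
+    Linear(hidden_dim, hidden_dim) -> ReLU -> Linear(hidden_dim, output_dim)`: ReLU is applied after
+    every layer except the last.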
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+# Copied from transformers.models.detr.modeling_detr._upcast
+def _upcast(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.is_floating_point():
+        return t if t.dtype in (torch.float32, torch.float64) else t.float()
+    else:
+        return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+# Copied from transformers.models.detr.modeling_detr.box_area
+def box_area(boxes: Tensor) -> Tensor:
+    """
+    Computes the area of a set of bounding boxes, which are specified by their (x1, y1, x2, y2) coordinates.
+
+    Args:
+        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
+            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
+            < x2` and `0 <= y1 < y2`.
+
+    Returns:
+        `torch.FloatTensor`: a tensor containing the area for each box.
+    """
+    boxes = _upcast(boxes)
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+# Copied from transformers.models.detr.modeling_detr.box_iou
+def box_iou(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
+    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]
+
+    union = area1[:, None] + area2 - inter
+
+    iou = inter / union
+    return iou, union
+
+
+# Copied from transformers.models.detr.modeling_detr.generalized_box_iou
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
+
+    Returns:
+        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
+    """
+    # degenerate boxes give inf / nan results
+    # so do an early check
+    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
+        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
+    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
+        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
+    iou, union = box_iou(boxes1, boxes2)
+
+    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    area = width_height[:, :, 0] * width_height[:, :, 1]
+
+    return iou - (area - union) / area
+
+
+# Copied from transformers.models.detr.modeling_detr._max_by_axis
+def _max_by_axis(the_list):
+    # type: (List[List[int]]) -> List[int]
+    maxes = the_list[0]
+    for sublist in the_list[1:]:
+        for index, item in enumerate(sublist):
+            maxes[index] = max(maxes[index], item)
+    return maxes
+
+
+# Copied from transformers.models.detr.modeling_detr.dice_loss
+def dice_loss(inputs, targets, num_boxes):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+
+    Args:
+        inputs: A float tensor of arbitrary shape.
+ The predictions for each example. + targets: A float tensor with the same shape as inputs. Stores the binary + classification label for each element in inputs (0 for the negative class and 1 for the positive + class). + """ + inputs = inputs.sigmoid() + inputs = inputs.flatten(1) + numerator = 2 * (inputs * targets).sum(1) + denominator = inputs.sum(-1) + targets.sum(-1) + loss = 1 - (numerator + 1) / (denominator + 1) + return loss.sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.sigmoid_focal_loss +def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2): + """ + Loss used in RetinaNet for dense detection: https://arxiv.org/abs/1708.02002. + + Args: + inputs (`torch.FloatTensor` of arbitrary shape): + The predictions for each example. + targets (`torch.FloatTensor` with the same shape as `inputs`) + A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class + and 1 for the positive class). + alpha (`float`, *optional*, defaults to `0.25`): + Optional weighting factor in the range (0,1) to balance positive vs. negative examples. + gamma (`int`, *optional*, defaults to `2`): + Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples. + + Returns: + Loss tensor + """ + prob = inputs.sigmoid() + ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none") + # add modulating factor + p_t = prob * targets + (1 - prob) * (1 - targets) + loss = ce_loss * ((1 - p_t) ** gamma) + + if alpha >= 0: + alpha_t = alpha * targets + (1 - alpha) * (1 - targets) + loss = alpha_t * loss + + return loss.mean(1).sum() / num_boxes + + +# Copied from transformers.models.detr.modeling_detr.NestedTensor +class NestedTensor(object): + def __init__(self, tensors, mask: Optional[Tensor]): + self.tensors = tensors + self.mask = mask + + def to(self, device): + cast_tensor = self.tensors.to(device) + mask = self.mask + if mask is not None: + cast_mask = mask.to(device) + else: + cast_mask = None + return NestedTensor(cast_tensor, cast_mask) + + def decompose(self): + return self.tensors, self.mask + + def __repr__(self): + return str(self.tensors) + + +# Copied from transformers.models.detr.modeling_detr.nested_tensor_from_tensor_list +def nested_tensor_from_tensor_list(tensor_list: List[Tensor]): + if tensor_list[0].ndim == 3: + max_size = _max_by_axis([list(img.shape) for img in tensor_list]) + batch_shape = [len(tensor_list)] + max_size + batch_size, num_channels, height, width = batch_shape + dtype = tensor_list[0].dtype + device = tensor_list[0].device + tensor = torch.zeros(batch_shape, dtype=dtype, device=device) + mask = torch.ones((batch_size, height, width), dtype=torch.bool, device=device) + for img, pad_img, m in zip(tensor_list, tensor, mask): + pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) + m[: img.shape[1], : img.shape[2]] = False + else: + raise ValueError("Only 3-dimensional tensors are supported") + return NestedTensor(tensor, mask) + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrHungarianMatcher with DeformableDetr->GroundingDino +class GroundingDinoHungarianMatcher(nn.Module): + """ + This class computes an assignment between the targets and the predictions of the network. + + For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more + predictions than targets. 
In this case, we do a 1-to-1 matching of the best predictions, while the others are + un-matched (and thus treated as non-objects). + + Args: + class_cost: + The relative weight of the classification error in the matching cost. + bbox_cost: + The relative weight of the L1 error of the bounding box coordinates in the matching cost. + giou_cost: + The relative weight of the giou loss of the bounding box in the matching cost. + """ + + def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1): + super().__init__() + requires_backends(self, ["scipy"]) + + self.class_cost = class_cost + self.bbox_cost = bbox_cost + self.giou_cost = giou_cost + if class_cost == 0 and bbox_cost == 0 and giou_cost == 0: + raise ValueError("All costs of the Matcher can't be 0") + + @torch.no_grad() + def forward(self, outputs, targets): + """ + Args: + outputs (`dict`): + A dictionary that contains at least these entries: + * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits + * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates. + targets (`List[dict]`): + A list of targets (len(targets) = batch_size), where each target is a dict containing: + * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of + ground-truth + objects in the target) containing the class labels + * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates. + + Returns: + `List[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where: + - index_i is the indices of the selected predictions (in order) + - index_j is the indices of the corresponding selected targets (in order) + For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes) + """ + batch_size, num_queries = outputs["logits"].shape[:2] + + # We flatten to compute the cost matrices in a batch + out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes] + out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] + + # Also concat the target labels and boxes + target_ids = torch.cat([v["class_labels"] for v in targets]) + target_bbox = torch.cat([v["boxes"] for v in targets]) + + # Compute the classification cost. 
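+        # Following the focal-loss formulation below (same alpha/gamma as in Deformable DETR), the matching
+        # cost of assigning a prediction to a target decreases as the predicted probability of the target's
+        # class increases.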
+ alpha = 0.25 + gamma = 2.0 + neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log()) + pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) + class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids] + + # Compute the L1 cost between boxes + bbox_cost = torch.cdist(out_bbox, target_bbox, p=1) + + # Compute the giou cost between boxes + giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox)) + + # Final cost matrix + cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost + cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu() + + sizes = [len(v["boxes"]) for v in targets] + indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))] + return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] + + +# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrLoss with DeformableDetr->GroundingDino +class GroundingDinoLoss(nn.Module): + """ + This class computes the losses for `GroundingDinoForObjectDetection`. The process happens in two steps: 1) we + compute hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of + matched ground-truth / prediction (supervise class and box). + + Args: + matcher (`GroundingDinoHungarianMatcher`): + Module able to compute a matching between targets and proposals. + num_classes (`int`): + Number of object categories, omitting the special no-object category. + focal_alpha (`float`): + Alpha parameter in focal loss. + losses (`List[str]`): + List of all the losses to be applied. See `get_loss` for a list of all available losses. + """ + + def __init__(self, matcher, num_classes, focal_alpha, losses): + super().__init__() + self.matcher = matcher + self.num_classes = num_classes + self.focal_alpha = focal_alpha + self.losses = losses + + # removed logging parameter, which was part of the original implementation + def loss_labels(self, outputs, targets, indices, num_boxes): + """ + Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor + of dim [nb_target_boxes] + """ + if "logits" not in outputs: + raise KeyError("No logits were found in the outputs") + source_logits = outputs["logits"] + + idx = self._get_source_permutation_idx(indices) + target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)]) + target_classes = torch.full( + source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device + ) + target_classes[idx] = target_classes_o + + target_classes_onehot = torch.zeros( + [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1], + dtype=source_logits.dtype, + layout=source_logits.layout, + device=source_logits.device, + ) + target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1) + + target_classes_onehot = target_classes_onehot[:, :, :-1] + loss_ce = ( + sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2) + * source_logits.shape[1] + ) + losses = {"loss_ce": loss_ce} + + return losses + + @torch.no_grad() + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_cardinality + def loss_cardinality(self, outputs, targets, indices, num_boxes): + """ + Compute the cardinality error, i.e. 
the absolute error in the number of predicted non-empty boxes. + + This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients. + """ + logits = outputs["logits"] + device = logits.device + target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device) + # Count the number of predictions that are NOT "no-object" (which is the last class) + card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1) + card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float()) + losses = {"cardinality_error": card_err} + return losses + + # Copied from transformers.models.detr.modeling_detr.DetrLoss.loss_boxes + def loss_boxes(self, outputs, targets, indices, num_boxes): + """ + Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss. + + Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes + are expected in format (center_x, center_y, w, h), normalized by the image size. + """ + if "pred_boxes" not in outputs: + raise KeyError("No predicted boxes found in outputs") + idx = self._get_source_permutation_idx(indices) + source_boxes = outputs["pred_boxes"][idx] + target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0) + + loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none") + + losses = {} + losses["loss_bbox"] = loss_bbox.sum() / num_boxes + + loss_giou = 1 - torch.diag( + generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes)) + ) + losses["loss_giou"] = loss_giou.sum() / num_boxes + return losses + + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_source_permutation_idx + def _get_source_permutation_idx(self, indices): + # permute predictions following indices + batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)]) + source_idx = torch.cat([source for (source, _) in indices]) + return batch_idx, source_idx + + # Copied from transformers.models.detr.modeling_detr.DetrLoss._get_target_permutation_idx + def _get_target_permutation_idx(self, indices): + # permute targets following indices + batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)]) + target_idx = torch.cat([target for (_, target) in indices]) + return batch_idx, target_idx + + def get_loss(self, loss, outputs, targets, indices, num_boxes): + loss_map = { + "labels": self.loss_labels, + "cardinality": self.loss_cardinality, + "boxes": self.loss_boxes, + } + if loss not in loss_map: + raise ValueError(f"Loss {loss} not supported") + return loss_map[loss](outputs, targets, indices, num_boxes) + + def forward(self, outputs, targets): + """ + This performs the loss computation. + + Args: + outputs (`dict`, *optional*): + Dictionary of tensors, see the output specification of the model for the format. + targets (`List[dict]`, *optional*): + List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the + losses applied, see each loss' doc. 
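+
+        Returns:
+            `Dict[str, torch.Tensor]`: A dictionary of losses. Losses from auxiliary decoder layers are
+            suffixed with the layer index (e.g. `loss_ce_0`) and losses on the encoder proposals with `_enc`.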
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if k != "auxiliary_outputs" and k != "enc_outputs"}
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        indices = self.matcher(outputs_without_aux, targets)
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["class_labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        world_size = 1
+        if is_accelerate_available():
+            if PartialState._shared_state != {}:
+                num_boxes = reduce(num_boxes)
+                world_size = PartialState().num_processes
+        num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "auxiliary_outputs" in outputs:
+            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
+                indices = self.matcher(auxiliary_outputs, targets)
+                for loss in self.losses:
+                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        if "enc_outputs" in outputs:
+            enc_outputs = outputs["enc_outputs"]
+            bin_targets = copy.deepcopy(targets)
+            for bt in bin_targets:
+                bt["class_labels"] = torch.zeros_like(bt["class_labels"])
+            indices = self.matcher(enc_outputs, bin_targets)
+            for loss in self.losses:
+                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
+                l_dict = {k + "_enc": v for k, v in l_dict.items()}
+                losses.update(l_dict)
+
+        return losses
+
+
+@add_start_docstrings(
+    """
+    Grounding DINO Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top,
+    for tasks such as COCO detection.
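+
+    Training uses a Hungarian matcher together with a weighted sum of focal (classification), L1 and GIoU
+    (bounding box) losses.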
+    """,
+    GROUNDING_DINO_START_DOCSTRING,
+)
+class GroundingDinoForObjectDetection(GroundingDinoPreTrainedModel):
+    # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+    # the bbox_embed in the decoder are all clones though
+    _tied_weights_keys = [r"bbox_embed\.[1-9]\d*", r"model\.decoder\.bbox_embed\.[0-9]\d*"]
+
+    def __init__(self, config: GroundingDinoConfig):
+        super().__init__(config)
+
+        self.model = GroundingDinoModel(config)
+        _class_embed = GroundingDinoContrastiveEmbedding(config)
+
+        if config.decoder_bbox_embed_share:
+            _bbox_embed = GroundingDinoMLPPredictionHead(
+                input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+            )
+            self.bbox_embed = nn.ModuleList([_bbox_embed for _ in range(config.decoder_layers)])
+        else:
+            # each decoder layer gets its own (unshared) bbox head
+            self.bbox_embed = nn.ModuleList(
+                [
+                    GroundingDinoMLPPredictionHead(
+                        input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+                    )
+                    for _ in range(config.decoder_layers)
+                ]
+            )
+        self.class_embed = nn.ModuleList([_class_embed for _ in range(config.decoder_layers)])
+        # hack for box-refinement
+        self.model.decoder.bbox_embed = self.bbox_embed
+        # hack implementation for two-stage
+        self.model.decoder.class_embed = self.class_embed
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class[:-1], outputs_coord[:-1])]
+
+    @add_start_docstrings_to_model_forward(GROUNDING_DINO_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=GroundingDinoObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        input_ids: torch.LongTensor,
+        token_type_ids: torch.LongTensor = None,
+        attention_mask: torch.LongTensor = None,
+        pixel_mask: Optional[torch.BoolTensor] = None,
+        encoder_outputs: Optional[Union[GroundingDinoEncoderOutput, Tuple]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: List[Dict[str, Union[torch.LongTensor, torch.FloatTensor]]] = None,
+    ):
+        r"""
+        labels (`List[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, GroundingDinoForObjectDetection
+        >>> from PIL import Image
+        >>> import requests
+        >>> import torch
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "a cat."
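+        >>> # phrases in the text query are typically lowercased and end with a dot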
+
+        >>> processor = AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny")
+        >>> model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny")
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> # convert outputs (bounding boxes and class logits) to COCO API
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = processor.image_processor.post_process_object_detection(
+        ...     outputs, threshold=0.35, target_sizes=target_sizes
+        ... )[0]
+        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(f"Detected {label.item()} with confidence " f"{round(score.item(), 3)} at location {box}")
+        Detected 1 with confidence 0.453 at location [344.82, 23.18, 637.4, 373.83]
+        Detected 1 with confidence 0.408 at location [11.92, 51.58, 316.57, 472.89]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+
+        # First, send images through Grounding DINO base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values=pixel_values,
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            pixel_mask=pixel_mask,
+            encoder_outputs=encoder_outputs,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        idx = 5 + (1 if output_attentions else 0) + (1 if output_hidden_states else 0)
+        enc_text_hidden_state = outputs.encoder_last_hidden_state_text if return_dict else outputs[idx]
+        hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
+        init_reference_points = outputs.init_reference_points if return_dict else outputs[1]
+        inter_references_points = outputs.intermediate_reference_points if return_dict else outputs[3]
+
+        # class logits + predicted bounding boxes
+        outputs_classes = []
+        outputs_coords = []
+
+        # hidden_states are of shape (batch_size, num_decoder_layers, num_queries, hidden_size)
+        # predict class and bounding box deltas for each decoder layer
+        num_levels = hidden_states.shape[1]
+        for level in range(num_levels):
+            if level == 0:
+                reference = init_reference_points
+            else:
+                reference = inter_references_points[:, level - 1]
+            reference = torch.special.logit(reference, eps=1e-5)
+            outputs_class = self.class_embed[level](
+                vision_hidden_state=hidden_states[:, level],
+                text_hidden_state=enc_text_hidden_state,
+                text_token_mask=attention_mask.bool(),
+            )
+            delta_bbox = self.bbox_embed[level](hidden_states[:, level])
+
+            reference_coordinates = reference.shape[-1]
+            if reference_coordinates == 4:
+                outputs_coord_logits = delta_bbox + reference
+            elif reference_coordinates == 2:
+                delta_bbox[..., :2] += reference
+                outputs_coord_logits = delta_bbox
+            else:
+                raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
+            outputs_coord = outputs_coord_logits.sigmoid()
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+        outputs_class = torch.stack(outputs_classes)
+        outputs_coord = torch.stack(outputs_coords)
+
+        logits = outputs_class[-1]
+        pred_boxes = outputs_coord[-1]
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            # First: create the matcher
+            matcher = GroundingDinoHungarianMatcher(
+                class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
+            )
+            # Second: create the criterion
+            losses = ["labels", "boxes", "cardinality"]
+            criterion = GroundingDinoLoss(
+                matcher=matcher,
+                num_classes=self.config.num_labels,
+                focal_alpha=self.config.focal_alpha,
+                losses=losses,
+            )
+            criterion.to(self.device)
+            # Third: compute the losses, based on outputs and labels
+            outputs_loss = {}
+            outputs_loss["logits"] = logits
+            outputs_loss["pred_boxes"] = pred_boxes
+            if self.config.auxiliary_loss:
+                auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
+                outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+            if self.config.two_stage:
+                enc_outputs_coord = outputs[-1].sigmoid()
+                outputs_loss["enc_outputs"] = {"logits": outputs[-2], "pred_boxes": enc_outputs_coord}
+
+            loss_dict = criterion(outputs_loss, labels)
+            # Fourth: compute total loss, as a weighted sum of the various losses
+            weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
+            weight_dict["loss_giou"] = self.config.giou_loss_coefficient
+            if self.config.auxiliary_loss:
+                aux_weight_dict = {}
+                for i in range(self.config.decoder_layers - 1):
+                    aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+                weight_dict.update(aux_weight_dict)
+            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes) + auxiliary_outputs + outputs
+            else:
+                output = (logits, pred_boxes) + outputs
+            tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output
+
+            return tuple_outputs
+
+        dict_outputs = GroundingDinoObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            last_hidden_state=outputs.last_hidden_state,
+            auxiliary_outputs=auxiliary_outputs,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            encoder_last_hidden_state_vision=outputs.encoder_last_hidden_state_vision,
+            encoder_last_hidden_state_text=outputs.encoder_last_hidden_state_text,
+            encoder_vision_hidden_states=outputs.encoder_vision_hidden_states,
+            encoder_text_hidden_states=outputs.encoder_text_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            intermediate_hidden_states=outputs.intermediate_hidden_states,
+            intermediate_reference_points=outputs.intermediate_reference_points,
+            init_reference_points=outputs.init_reference_points,
+            enc_outputs_class=outputs.enc_outputs_class,
+            enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+        )
+
+        return dict_outputs
diff --git a/src/transformers/models/grounding_dino/processing_grounding_dino.py b/src/transformers/models/grounding_dino/processing_grounding_dino.py
new file mode 100644
index 0000000000..44b99811d9
--- /dev/null
+++ b/src/transformers/models/grounding_dino/processing_grounding_dino.py
@@ -0,0 +1,228 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Grounding DINO.
+"""
+
+from typing import List, Optional, Tuple, Union
+
+from ...image_processing_utils import BatchFeature
+from ...image_transforms import center_to_corners_format
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+
+def get_phrases_from_posmap(posmaps, input_ids):
+    """Get token ids of phrases from posmaps and input_ids.
+
+    Args:
+        posmaps (`torch.BoolTensor` of shape `(num_boxes, hidden_size)`):
+            A boolean tensor of text-thresholded logits related to the detected bounding boxes.
+        input_ids (`torch.LongTensor` of shape `(sequence_length,)`):
+            A tensor of token ids.
+    """
+    left_idx = 0
+    right_idx = posmaps.shape[-1] - 1
+
+    # Avoiding altering the input tensor
+    posmaps = posmaps.clone()
+
+    posmaps[:, 0 : left_idx + 1] = False
+    posmaps[:, right_idx:] = False
+
+    token_ids = []
+    for posmap in posmaps:
+        non_zero_idx = posmap.nonzero(as_tuple=True)[0].tolist()
+        token_ids.append([input_ids[i] for i in non_zero_idx])
+
+    return token_ids
+
+
+class GroundingDinoProcessor(ProcessorMixin):
+    r"""
+    Constructs a Grounding DINO processor which wraps a Deformable DETR image processor and a BERT tokenizer into a
+    single processor.
+
+    [`GroundingDinoProcessor`] offers all the functionalities of [`GroundingDinoImageProcessor`] and
+    [`AutoTokenizer`]. See the docstring of [`~GroundingDinoProcessor.__call__`] and [`~GroundingDinoProcessor.decode`]
+    for more information.
+
+    Args:
+        image_processor (`GroundingDinoImageProcessor`):
+            An instance of [`GroundingDinoImageProcessor`]. The image processor is a required input.
+        tokenizer (`AutoTokenizer`):
+            An instance of [`PreTrainedTokenizer`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "GroundingDinoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, image_processor, tokenizer):
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images: ImageInput = None,
+        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_token_type_ids: bool = True,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        This method uses the [`GroundingDinoImageProcessor.__call__`] method to prepare image(s) for the model, and
+        [`BertTokenizerFast.__call__`] to prepare text for the model.
+
+        Please refer to the docstring of the above two methods for more information.
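+
+        When both `images` and `text` are provided, the returned [`BatchEncoding`] combines the tokenizer
+        outputs (e.g. `input_ids`, `attention_mask`, `token_type_ids`) with the image processor outputs
+        (e.g. `pixel_values`).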
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify either images or text.")
+
+        # Images
+        if images is not None:
+            encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
+        else:
+            encoding_image_processor = BatchFeature()
+
+        if text is not None:
+            text_encoding = self.tokenizer(
+                text=text,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_token_type_ids=return_token_type_ids,
+                return_length=return_length,
+                verbose=verbose,
+                return_tensors=return_tensors,
+                **kwargs,
+            )
+        else:
+            text_encoding = BatchEncoding()
+
+        text_encoding.update(encoding_image_processor)
+
+        return text_encoding
+
+    # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
+        to the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    def post_process_grounded_object_detection(
+        self,
+        outputs,
+        input_ids,
+        box_threshold: float = 0.25,
+        text_threshold: float = 0.25,
+        target_sizes: Union[TensorType, List[Tuple]] = None,
+    ):
+        """
+        Converts the raw output of [`GroundingDinoForObjectDetection`] into final bounding boxes in (top_left_x,
+        top_left_y, bottom_right_x, bottom_right_y) format and gets the associated text labels.
+
+        Args:
+            outputs ([`GroundingDinoObjectDetectionOutput`]):
+                Raw outputs of the model.
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                The token ids of the input text.
+            box_threshold (`float`, *optional*, defaults to 0.25):
+                Score threshold to keep object detection predictions.
+            text_threshold (`float`, *optional*, defaults to 0.25):
+                Score threshold to keep text detection predictions.
+            target_sizes (`torch.Tensor` or `List[Tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`Tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+        Returns:
+            `List[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
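+            Each dictionary contains the keys `"scores"`, `"labels"` (the decoded text phrases) and `"boxes"`.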
+ """ + logits, boxes = outputs.logits, outputs.pred_boxes + + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + probs = torch.sigmoid(logits) # (batch_size, num_queries, 256) + scores = torch.max(probs, dim=-1)[0] # (batch_size, num_queries) + + # Convert to [x0, y0, x1, y1] format + boxes = center_to_corners_format(boxes) + + # Convert from relative [0, 1] to absolute [0, height] coordinates + if target_sizes is not None: + if isinstance(target_sizes, List): + img_h = torch.Tensor([i[0] for i in target_sizes]) + img_w = torch.Tensor([i[1] for i in target_sizes]) + else: + img_h, img_w = target_sizes.unbind(1) + + scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device) + boxes = boxes * scale_fct[:, None, :] + + results = [] + for idx, (s, b, p) in enumerate(zip(scores, boxes, probs)): + score = s[s > box_threshold] + box = b[s > box_threshold] + prob = p[s > box_threshold] + label_ids = get_phrases_from_posmap(prob > text_threshold, input_ids[idx]) + label = self.batch_decode(label_ids) + results.append({"scores": score, "labels": label, "boxes": box}) + + return results diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py index 1c04fb9082..15dc4f9bc2 100644 --- a/src/transformers/utils/dummy_pt_objects.py +++ b/src/transformers/utils/dummy_pt_objects.py @@ -4236,6 +4236,30 @@ class GraphormerPreTrainedModel(metaclass=DummyObject): requires_backends(self, ["torch"]) +GROUNDING_DINO_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class GroundingDinoForObjectDetection(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDinoModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + +class GroundingDinoPreTrainedModel(metaclass=DummyObject): + _backends = ["torch"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["torch"]) + + GROUPVIT_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py index d7a629a1b2..80b418adc1 100644 --- a/src/transformers/utils/dummy_vision_objects.py +++ b/src/transformers/utils/dummy_vision_objects.py @@ -247,6 +247,13 @@ class GLPNImageProcessor(metaclass=DummyObject): requires_backends(self, ["vision"]) +class GroundingDinoImageProcessor(metaclass=DummyObject): + _backends = ["vision"] + + def __init__(self, *args, **kwargs): + requires_backends(self, ["vision"]) + + class IdeficsImageProcessor(metaclass=DummyObject): _backends = ["vision"] diff --git a/tests/models/grounding_dino/__init__.py b/tests/models/grounding_dino/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/tests/models/grounding_dino/test_image_processing_grounding_dino.py b/tests/models/grounding_dino/test_image_processing_grounding_dino.py new file mode 100644 index 0000000000..df69784bbb --- /dev/null +++ b/tests/models/grounding_dino/test_image_processing_grounding_dino.py @@ -0,0 +1,530 @@ +# coding=utf-8 +# Copyright 2024 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import json +import pathlib +import unittest + +from transformers.testing_utils import require_torch, require_vision, slow +from transformers.utils import is_torch_available, is_vision_available + +from ...test_image_processing_common import AnnotationFormatTestMixin, ImageProcessingTestMixin, prepare_image_inputs + + +if is_torch_available(): + import torch + + from transformers.models.grounding_dino.modeling_grounding_dino import GroundingDinoObjectDetectionOutput + +if is_vision_available(): + from PIL import Image + + from transformers import GroundingDinoImageProcessor + + +class GroundingDinoImageProcessingTester(unittest.TestCase): + def __init__( + self, + parent, + batch_size=7, + num_channels=3, + min_resolution=30, + max_resolution=400, + do_resize=True, + size=None, + do_normalize=True, + image_mean=[0.5, 0.5, 0.5], + image_std=[0.5, 0.5, 0.5], + do_rescale=True, + rescale_factor=1 / 255, + do_pad=True, + ): + # by setting size["longest_edge"] > max_resolution we're effectively not testing this :p + size = size if size is not None else {"shortest_edge": 18, "longest_edge": 1333} + self.parent = parent + self.batch_size = batch_size + self.num_channels = num_channels + self.min_resolution = min_resolution + self.max_resolution = max_resolution + self.do_resize = do_resize + self.size = size + self.do_normalize = do_normalize + self.image_mean = image_mean + self.image_std = image_std + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_pad = do_pad + self.num_queries = 5 + self.embed_dim = 5 + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.prepare_image_processor_dict with DeformableDetr->GroundingDino + def prepare_image_processor_dict(self): + return { + "do_resize": self.do_resize, + "size": self.size, + "do_normalize": self.do_normalize, + "image_mean": self.image_mean, + "image_std": self.image_std, + "do_rescale": self.do_rescale, + "rescale_factor": self.rescale_factor, + "do_pad": self.do_pad, + } + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.get_expected_values with DeformableDetr->GroundingDino + def get_expected_values(self, image_inputs, batched=False): + """ + This function computes the expected height and width when providing images to GroundingDinoImageProcessor, + assuming do_resize is set to True with a scalar size. 
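+
+        For example, with `size={"shortest_edge": 18}`, an image of height 30 and width 50 is resized to
+        height 18 and width 30: the shorter side is scaled to 18 and the aspect ratio is preserved.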
+ """ + if not batched: + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + if w < h: + expected_height = int(self.size["shortest_edge"] * h / w) + expected_width = self.size["shortest_edge"] + elif w > h: + expected_height = self.size["shortest_edge"] + expected_width = int(self.size["shortest_edge"] * w / h) + else: + expected_height = self.size["shortest_edge"] + expected_width = self.size["shortest_edge"] + + else: + expected_values = [] + for image in image_inputs: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.expected_output_image_shape with DeformableDetr->GroundingDino + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + return self.num_channels, height, width + + def get_fake_grounding_dino_output(self): + torch.manual_seed(42) + return GroundingDinoObjectDetectionOutput( + pred_boxes=torch.rand(self.batch_size, self.num_queries, 4), + logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim), + ) + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTester.prepare_image_inputs with DeformableDetr->GroundingDino + def prepare_image_inputs(self, equal_resolution=False, numpify=False, torchify=False): + return prepare_image_inputs( + batch_size=self.batch_size, + num_channels=self.num_channels, + min_resolution=self.min_resolution, + max_resolution=self.max_resolution, + equal_resolution=equal_resolution, + numpify=numpify, + torchify=torchify, + ) + + +@require_torch +@require_vision +class GroundingDinoImageProcessingTest(AnnotationFormatTestMixin, ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = GroundingDinoImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = GroundingDinoImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_image_processor_properties with DeformableDetr->GroundingDino + def test_image_processor_properties(self): + image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "size")) + + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_image_processor_from_dict_with_kwargs with DeformableDetr->GroundingDino + def test_image_processor_from_dict_with_kwargs(self): + image_processor = self.image_processing_class.from_dict(self.image_processor_dict) + self.assertEqual(image_processor.size, {"shortest_edge": 18, 
"longest_edge": 1333}) + self.assertEqual(image_processor.do_pad, True) + + image_processor = self.image_processing_class.from_dict( + self.image_processor_dict, size=42, max_size=84, pad_and_return_pixel_mask=False + ) + self.assertEqual(image_processor.size, {"shortest_edge": 42, "longest_edge": 84}) + self.assertEqual(image_processor.do_pad, False) + + def test_post_process_object_detection(self): + image_processor = self.image_processing_class(**self.image_processor_dict) + outputs = self.image_processor_tester.get_fake_grounding_dino_output() + results = image_processor.post_process_object_detection(outputs, threshold=0.0) + + self.assertEqual(len(results), self.image_processor_tester.batch_size) + self.assertEqual(list(results[0].keys()), ["scores", "labels", "boxes"]) + self.assertEqual(results[0]["boxes"].shape, (self.image_processor_tester.num_queries, 4)) + self.assertEqual(results[0]["scores"].shape, (self.image_processor_tester.num_queries,)) + + expected_scores = torch.tensor([0.7050, 0.7222, 0.7222, 0.6829, 0.7220]) + self.assertTrue(torch.allclose(results[0]["scores"], expected_scores, atol=1e-4)) + + expected_box_slice = torch.tensor([0.6908, 0.4354, 1.0737, 1.3947]) + self.assertTrue(torch.allclose(results[0]["boxes"][0], expected_box_slice, atol=1e-4)) + + @slow + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_detection_annotations with DeformableDetr->GroundingDino + def test_call_pytorch_with_coco_detection_annotations(self): + # prepare image and target + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"image_id": 39769, "annotations": target} + + # encode them + image_processing = GroundingDinoImageProcessor() + encoding = image_processing(images=image, annotations=target, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([5887.9600, 11250.2061, 489353.8438, 837122.7500, 147967.5156, 165732.3438]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.5503, 0.2765, 0.0604, 0.2215]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([75, 75, 63, 65, 17, 17]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + 
self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + + @slow + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_detection_annotations with Detr->GroundingDino + def test_batched_coco_detection_annotations(self): + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_annotations.txt", "r") as f: + target = json.loads(f.read()) + + annotations_0 = {"image_id": 39769, "annotations": target} + annotations_1 = {"image_id": 39769, "annotations": target} + + # Adjust the bounding boxes for the resized image + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotations_1["annotations"])): + coords = annotations_1["annotations"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotations_1["annotations"][i]["bbox"] = new_bbox + + images = [image_0, image_1] + annotations = [annotations_0, annotations_1] + + image_processing = GroundingDinoImageProcessor() + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + return_tensors="pt", # do_convert_annotations=True + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.6879, 0.4609, 0.0755, 0.3691], + [0.2118, 0.3359, 0.2601, 0.1566], + [0.5011, 0.5000, 0.9979, 1.0000], + [0.5010, 0.5020, 0.9979, 0.9959], + [0.3284, 0.5944, 0.5884, 0.8112], + [0.8394, 0.5445, 0.3213, 0.9110], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.4130, 0.2765, 0.0453, 0.2215], + [0.1272, 0.2016, 0.1561, 0.0940], + [0.3757, 0.4933, 0.7488, 0.9865], + [0.3759, 0.5002, 0.7492, 0.9955], + [0.1971, 0.5456, 0.3532, 0.8646], + [0.5790, 0.4115, 0.3430, 0.7161], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + 
).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) + + @slow + # Copied from tests.models.deformable_detr.test_image_processing_deformable_detr.DeformableDetrImageProcessingTest.test_call_pytorch_with_coco_panoptic_annotations with DeformableDetr->GroundingDino + def test_call_pytorch_with_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + target = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + # encode them + image_processing = GroundingDinoImageProcessor(format="coco_panoptic") + encoding = image_processing(images=image, annotations=target, masks_path=masks_path, return_tensors="pt") + + # verify pixel values + expected_shape = torch.Size([1, 3, 800, 1066]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + expected_slice = torch.tensor([0.2796, 0.3138, 0.3481]) + self.assertTrue(torch.allclose(encoding["pixel_values"][0, 0, 0, :3], expected_slice, atol=1e-4)) + + # verify area + expected_area = torch.tensor([147979.6875, 165527.0469, 484638.5938, 11292.9375, 5879.6562, 7634.1147]) + self.assertTrue(torch.allclose(encoding["labels"][0]["area"], expected_area)) + # verify boxes + expected_boxes_shape = torch.Size([6, 4]) + self.assertEqual(encoding["labels"][0]["boxes"].shape, expected_boxes_shape) + expected_boxes_slice = torch.tensor([0.2625, 0.5437, 0.4688, 0.8625]) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"][0], expected_boxes_slice, atol=1e-3)) + # verify image_id + expected_image_id = torch.tensor([39769]) + self.assertTrue(torch.allclose(encoding["labels"][0]["image_id"], expected_image_id)) + # verify is_crowd + expected_is_crowd = torch.tensor([0, 0, 0, 0, 0, 0]) + self.assertTrue(torch.allclose(encoding["labels"][0]["iscrowd"], expected_is_crowd)) + # verify class_labels + expected_class_labels = torch.tensor([17, 17, 63, 75, 75, 93]) + self.assertTrue(torch.allclose(encoding["labels"][0]["class_labels"], expected_class_labels)) + # verify masks + expected_masks_sum = 822873 + self.assertEqual(encoding["labels"][0]["masks"].sum().item(), expected_masks_sum) + # verify orig_size + expected_orig_size = torch.tensor([480, 640]) + 
self.assertTrue(torch.allclose(encoding["labels"][0]["orig_size"], expected_orig_size)) + # verify size + expected_size = torch.tensor([800, 1066]) + self.assertTrue(torch.allclose(encoding["labels"][0]["size"], expected_size)) + + @slow + # Copied from tests.models.detr.test_image_processing_detr.DetrImageProcessingTest.test_batched_coco_panoptic_annotations with Detr->GroundingDino + def test_batched_coco_panoptic_annotations(self): + # prepare image, target and masks_path + image_0 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png") + image_1 = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png").resize((800, 800)) + + with open("./tests/fixtures/tests_samples/COCO/coco_panoptic_annotations.txt", "r") as f: + target = json.loads(f.read()) + + annotation_0 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + annotation_1 = {"file_name": "000000039769.png", "image_id": 39769, "segments_info": target} + + w_0, h_0 = image_0.size + w_1, h_1 = image_1.size + for i in range(len(annotation_1["segments_info"])): + coords = annotation_1["segments_info"][i]["bbox"] + new_bbox = [ + coords[0] * w_1 / w_0, + coords[1] * h_1 / h_0, + coords[2] * w_1 / w_0, + coords[3] * h_1 / h_0, + ] + annotation_1["segments_info"][i]["bbox"] = new_bbox + + masks_path = pathlib.Path("./tests/fixtures/tests_samples/COCO/coco_panoptic") + + images = [image_0, image_1] + annotations = [annotation_0, annotation_1] + + # encode them + image_processing = GroundingDinoImageProcessor(format="coco_panoptic") + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_tensors="pt", + return_segmentation_masks=True, + ) + + # Check the pixel values have been padded + postprocessed_height, postprocessed_width = 800, 1066 + expected_shape = torch.Size([2, 3, postprocessed_height, postprocessed_width]) + self.assertEqual(encoding["pixel_values"].shape, expected_shape) + + # Check the bounding boxes have been adjusted for padded images + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + expected_boxes_0 = torch.tensor( + [ + [0.2625, 0.5437, 0.4688, 0.8625], + [0.7719, 0.4104, 0.4531, 0.7125], + [0.5000, 0.4927, 0.9969, 0.9854], + [0.1688, 0.2000, 0.2063, 0.0917], + [0.5492, 0.2760, 0.0578, 0.2187], + [0.4992, 0.4990, 0.9984, 0.9979], + ] + ) + expected_boxes_1 = torch.tensor( + [ + [0.1576, 0.3262, 0.2814, 0.5175], + [0.4634, 0.2463, 0.2720, 0.4275], + [0.3002, 0.2956, 0.5985, 0.5913], + [0.1013, 0.1200, 0.1238, 0.0550], + [0.3297, 0.1656, 0.0347, 0.1312], + [0.2997, 0.2994, 0.5994, 0.5987], + ] + ) + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1e-3)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1e-3)) + + # Check the masks have also been padded + self.assertEqual(encoding["labels"][0]["masks"].shape, torch.Size([6, 800, 1066])) + self.assertEqual(encoding["labels"][1]["masks"].shape, torch.Size([6, 800, 1066])) + + # Check if do_convert_annotations=False, then the annotations are not converted to centre_x, centre_y, width, height + # format and not in the range [0, 1] + encoding = image_processing( + images=images, + annotations=annotations, + masks_path=masks_path, + return_segmentation_masks=True, + do_convert_annotations=False, + return_tensors="pt", + ) + self.assertEqual(encoding["labels"][0]["boxes"].shape, torch.Size([6, 4])) + 
self.assertEqual(encoding["labels"][1]["boxes"].shape, torch.Size([6, 4])) + # Convert to absolute coordinates + unnormalized_boxes_0 = torch.vstack( + [ + expected_boxes_0[:, 0] * postprocessed_width, + expected_boxes_0[:, 1] * postprocessed_height, + expected_boxes_0[:, 2] * postprocessed_width, + expected_boxes_0[:, 3] * postprocessed_height, + ] + ).T + unnormalized_boxes_1 = torch.vstack( + [ + expected_boxes_1[:, 0] * postprocessed_width, + expected_boxes_1[:, 1] * postprocessed_height, + expected_boxes_1[:, 2] * postprocessed_width, + expected_boxes_1[:, 3] * postprocessed_height, + ] + ).T + # Convert from centre_x, centre_y, width, height to x_min, y_min, x_max, y_max + expected_boxes_0 = torch.vstack( + [ + unnormalized_boxes_0[:, 0] - unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] - unnormalized_boxes_0[:, 3] / 2, + unnormalized_boxes_0[:, 0] + unnormalized_boxes_0[:, 2] / 2, + unnormalized_boxes_0[:, 1] + unnormalized_boxes_0[:, 3] / 2, + ] + ).T + expected_boxes_1 = torch.vstack( + [ + unnormalized_boxes_1[:, 0] - unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] - unnormalized_boxes_1[:, 3] / 2, + unnormalized_boxes_1[:, 0] + unnormalized_boxes_1[:, 2] / 2, + unnormalized_boxes_1[:, 1] + unnormalized_boxes_1[:, 3] / 2, + ] + ).T + self.assertTrue(torch.allclose(encoding["labels"][0]["boxes"], expected_boxes_0, rtol=1)) + self.assertTrue(torch.allclose(encoding["labels"][1]["boxes"], expected_boxes_1, rtol=1)) diff --git a/tests/models/grounding_dino/test_modeling_grounding_dino.py b/tests/models/grounding_dino/test_modeling_grounding_dino.py new file mode 100644 index 0000000000..42486f92da --- /dev/null +++ b/tests/models/grounding_dino/test_modeling_grounding_dino.py @@ -0,0 +1,689 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Testing suite for the PyTorch Grounding DINO model. 
""" + +import collections +import inspect +import math +import re +import unittest + +from transformers import ( + GroundingDinoConfig, + SwinConfig, + is_torch_available, + is_vision_available, +) +from transformers.file_utils import cached_property +from transformers.testing_utils import ( + require_timm, + require_torch, + require_torch_gpu, + require_vision, + slow, + torch_device, +) + +from ...test_configuration_common import ConfigTester +from ...test_modeling_common import ModelTesterMixin, _config_zero_init, floats_tensor, ids_tensor +from ...test_pipeline_mixin import PipelineTesterMixin + + +if is_torch_available(): + import torch + + from transformers import GroundingDinoForObjectDetection, GroundingDinoModel + from transformers.pytorch_utils import id_tensor_storage + + +if is_vision_available(): + from PIL import Image + + from transformers import AutoProcessor + + +class GroundingDinoModelTester: + def __init__( + self, + parent, + batch_size=4, + is_training=True, + use_labels=True, + hidden_size=32, + num_hidden_layers=2, + num_attention_heads=4, + intermediate_size=4, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + num_queries=2, + num_channels=3, + image_size=98, + n_targets=8, + num_labels=3, + num_feature_levels=4, + encoder_n_points=2, + decoder_n_points=6, + max_text_len=7, + ): + self.parent = parent + self.batch_size = batch_size + self.is_training = is_training + self.use_labels = use_labels + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.num_queries = num_queries + self.num_channels = num_channels + self.image_size = image_size + self.n_targets = n_targets + self.num_labels = num_labels + self.num_feature_levels = num_feature_levels + self.encoder_n_points = encoder_n_points + self.decoder_n_points = decoder_n_points + self.max_text_len = max_text_len + + # we also set the expected seq length for both encoder and decoder + self.encoder_seq_length_vision = ( + math.ceil(self.image_size / 8) ** 2 + + math.ceil(self.image_size / 16) ** 2 + + math.ceil(self.image_size / 32) ** 2 + + math.ceil(self.image_size / 64) ** 2 + ) + + self.encoder_seq_length_text = self.max_text_len + + self.decoder_seq_length = self.num_queries + + def prepare_config_and_inputs(self): + pixel_values = floats_tensor([self.batch_size, self.num_channels, self.image_size, self.image_size]) + pixel_mask = torch.ones([self.batch_size, self.image_size, self.image_size], device=torch_device) + + input_ids = ids_tensor([self.batch_size, self.max_text_len], self.num_labels) + + labels = None + if self.use_labels: + # labels is a list of Dict (each Dict being the labels for a given example in the batch) + labels = [] + for i in range(self.batch_size): + target = {} + target["class_labels"] = torch.randint( + high=self.num_labels, size=(self.n_targets,), device=torch_device + ) + target["boxes"] = torch.rand(self.n_targets, 4, device=torch_device) + target["masks"] = torch.rand(self.n_targets, self.image_size, self.image_size, device=torch_device) + labels.append(target) + + config = self.get_config() + return config, pixel_values, pixel_mask, input_ids, labels + + def get_config(self): + swin_config = SwinConfig( + window_size=7, + embed_dim=8, + depths=[1, 1, 1, 1], + num_heads=[1, 1, 1, 
1], + image_size=self.image_size, + out_features=["stage2", "stage3", "stage4"], + out_indices=[2, 3, 4], + ) + text_backbone = { + "hidden_size": 8, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "intermediate_size": 8, + "max_position_embeddings": 8, + "model_type": "bert", + } + return GroundingDinoConfig( + d_model=self.hidden_size, + encoder_layers=self.num_hidden_layers, + decoder_layers=self.num_hidden_layers, + encoder_attention_heads=self.num_attention_heads, + decoder_attention_heads=self.num_attention_heads, + encoder_ffn_dim=self.intermediate_size, + decoder_ffn_dim=self.intermediate_size, + dropout=self.hidden_dropout_prob, + attention_dropout=self.attention_probs_dropout_prob, + num_queries=self.num_queries, + num_labels=self.num_labels, + num_feature_levels=self.num_feature_levels, + encoder_n_points=self.encoder_n_points, + decoder_n_points=self.decoder_n_points, + use_timm_backbone=False, + backbone_config=swin_config, + max_text_len=self.max_text_len, + text_config=text_backbone, + ) + + def prepare_config_and_inputs_for_common(self): + config, pixel_values, pixel_mask, input_ids, labels = self.prepare_config_and_inputs() + inputs_dict = {"pixel_values": pixel_values, "pixel_mask": pixel_mask, "input_ids": input_ids} + return config, inputs_dict + + def create_and_check_model(self, config, pixel_values, pixel_mask, input_ids, labels): + model = GroundingDinoModel(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.num_queries, self.hidden_size)) + + def create_and_check_object_detection_head_model(self, config, pixel_values, pixel_mask, input_ids, labels): + model = GroundingDinoForObjectDetection(config=config) + model.to(torch_device) + model.eval() + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids) + + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + result = model(pixel_values=pixel_values, pixel_mask=pixel_mask, input_ids=input_ids, labels=labels) + + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_queries, config.max_text_len)) + self.parent.assertEqual(result.pred_boxes.shape, (self.batch_size, self.num_queries, 4)) + + +@require_torch +class GroundingDinoModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): + all_model_classes = (GroundingDinoModel, GroundingDinoForObjectDetection) if is_torch_available() else () + is_encoder_decoder = True + test_torchscript = False + test_pruning = False + test_head_masking = False + test_missing_keys = False + pipeline_model_mapping = ( + {"image-feature-extraction": GroundingDinoModel, "zero-shot-object-detection": GroundingDinoForObjectDetection} + if is_torch_available() + else {} + ) + + # special case for head models + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False): + inputs_dict = super()._prepare_for_class(inputs_dict, model_class, return_labels=return_labels) + + if return_labels: + if model_class.__name__ == "GroundingDinoForObjectDetection": + labels = [] + for i in range(self.model_tester.batch_size): + target = {} + target["class_labels"] = torch.ones( + size=(self.model_tester.n_targets,), device=torch_device, dtype=torch.long + ) + 
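+                    # All-ones dummy targets are enough for these tests: each target dict mimics the
+                    # COCO-style label format the detection head consumes (class indices, normalized
+                    # boxes and per-instance segmentation masks).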
target["boxes"] = torch.ones( + self.model_tester.n_targets, 4, device=torch_device, dtype=torch.float + ) + target["masks"] = torch.ones( + self.model_tester.n_targets, + self.model_tester.image_size, + self.model_tester.image_size, + device=torch_device, + dtype=torch.float, + ) + labels.append(target) + inputs_dict["labels"] = labels + + return inputs_dict + + def setUp(self): + self.model_tester = GroundingDinoModelTester(self) + self.config_tester = ConfigTester(self, config_class=GroundingDinoConfig, has_text_modality=False) + + def test_config(self): + # we don't test common_properties and arguments_init as these don't apply for Grounding DINO + self.config_tester.create_and_test_config_to_json_string() + self.config_tester.create_and_test_config_to_json_file() + self.config_tester.create_and_test_config_from_and_save_pretrained() + self.config_tester.create_and_test_config_with_num_labels() + self.config_tester.check_config_can_be_init_without_params() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_object_detection_head_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_object_detection_head_model(*config_and_inputs) + + @unittest.skip(reason="Grounding DINO does not use inputs_embeds") + def test_inputs_embeds(self): + pass + + @unittest.skip(reason="Grounding DINO does not have a get_input_embeddings method") + def test_model_common_attributes(self): + pass + + @unittest.skip(reason="Grounding DINO does not use token embeddings") + def test_resize_tokens_embeddings(self): + pass + + @unittest.skip(reason="Feed forward chunking is not implemented") + def test_feed_forward_chunking(self): + pass + + def test_attention_outputs(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + + for model_class in self.all_model_classes: + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = False + config.return_dict = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions[-1] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + # check that output_attentions also work using config + del inputs_dict["output_attentions"] + config.output_attentions = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + attentions = outputs.encoder_attentions[-1] + self.assertEqual(len(attentions), self.model_tester.num_hidden_layers) + + self.assertListEqual( + list(attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + out_len = len(outputs) + + correct_outlen = 10 + + # loss is at first position + if "labels" in inputs_dict: + correct_outlen += 1 # loss is added to beginning + # Object Detection model returns pred_logits and pred_boxes + if model_class.__name__ == "GroundingDinoForObjectDetection": + correct_outlen += 2 + + self.assertEqual(out_len, correct_outlen) + + # decoder attentions + decoder_attentions = outputs.decoder_attentions[0] + self.assertIsInstance(decoder_attentions, (list, tuple)) + 
self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(decoder_attentions[0].shape[-3:]), + [self.model_tester.num_attention_heads, self.model_tester.num_queries, self.model_tester.num_queries], + ) + + # cross attentions + cross_attentions = outputs.decoder_attentions[-1] + self.assertIsInstance(cross_attentions, (list, tuple)) + self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(cross_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.decoder_n_points, + ], + ) + + # Check attention is always last and order is fine + inputs_dict["output_attentions"] = True + inputs_dict["output_hidden_states"] = True + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + self.assertEqual(out_len + 3, len(outputs)) + + self_attentions = outputs.encoder_attentions[-1] + + self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers) + self.assertListEqual( + list(self_attentions[0].shape[-3:]), + [ + self.model_tester.num_attention_heads, + self.model_tester.num_feature_levels, + self.model_tester.encoder_n_points, + ], + ) + + # overwrite since hidden_states are called encoder_text_hidden_states + def test_hidden_states_output(self): + def check_hidden_states_output(inputs_dict, config, model_class): + model = model_class(config) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + hidden_states = outputs.encoder_vision_hidden_states + + expected_num_layers = getattr( + self.model_tester, "expected_num_hidden_layers", self.model_tester.num_hidden_layers + 1 + ) + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_vision + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + hidden_states = outputs.encoder_text_hidden_states + + self.assertEqual(len(hidden_states), expected_num_layers) + + seq_len = self.model_tester.encoder_seq_length_text + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [seq_len, self.model_tester.hidden_size], + ) + + hidden_states = outputs.decoder_hidden_states + + self.assertIsInstance(hidden_states, (list, tuple)) + self.assertEqual(len(hidden_states), expected_num_layers) + seq_len = getattr(self.model_tester, "seq_length", None) + decoder_seq_length = getattr(self.model_tester, "decoder_seq_length", seq_len) + + self.assertListEqual( + list(hidden_states[0].shape[-2:]), + [decoder_seq_length, self.model_tester.hidden_size], + ) + + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + inputs_dict["output_hidden_states"] = True + check_hidden_states_output(inputs_dict, config, model_class) + + # check that output_hidden_states also work using config + del inputs_dict["output_hidden_states"] + config.output_hidden_states = True + + check_hidden_states_output(inputs_dict, config, model_class) + + # removed retain_grad and grad on decoder_hidden_states, as queries don't require grad + def test_retain_grad_hidden_states_attentions(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.output_hidden_states = True + config.output_attentions 
= True + + # no need to test all models as different heads yield the same functionality + model_class = self.all_model_classes[0] + model = model_class(config) + model.to(torch_device) + + inputs = self._prepare_for_class(inputs_dict, model_class) + + outputs = model(**inputs) + + output = outputs[0] + + encoder_hidden_states = outputs.encoder_vision_hidden_states[0] + encoder_attentions = outputs.encoder_attentions[0][0] + encoder_hidden_states.retain_grad() + encoder_attentions.retain_grad() + + cross_attentions = outputs.decoder_attentions[-1][0] + cross_attentions.retain_grad() + + output.flatten()[0].backward(retain_graph=True) + + self.assertIsNotNone(encoder_hidden_states.grad) + self.assertIsNotNone(encoder_attentions.grad) + self.assertIsNotNone(cross_attentions.grad) + + def test_forward_signature(self): + config, _ = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + model = model_class(config) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["pixel_values", "input_ids"] + self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names) + + def test_different_timm_backbone(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + # let's pick a random timm backbone + config.backbone = "tf_mobilenetv3_small_075" + config.use_timm_backbone = True + config.backbone_config = None + config.backbone_kwargs = {"in_chans": 3, "out_indices": (2, 3, 4)} + + for model_class in self.all_model_classes: + model = model_class(config) + model.to(torch_device) + model.eval() + with torch.no_grad(): + outputs = model(**self._prepare_for_class(inputs_dict, model_class)) + + if model_class.__name__ == "GroundingDinoForObjectDetection": + expected_shape = ( + self.model_tester.batch_size, + self.model_tester.num_queries, + config.max_text_len, + ) + self.assertEqual(outputs.logits.shape, expected_shape) + + self.assertTrue(outputs) + + def test_initialization(self): + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + + configs_no_init = _config_zero_init(config) + for model_class in self.all_model_classes: + model = model_class(config=configs_no_init) + for name, param in model.named_parameters(): + if param.requires_grad: + if ( + "level_embed" in name + or "sampling_offsets.bias" in name + or "text_param" in name + or "vision_param" in name + or "value_proj" in name + or "output_proj" in name + or "reference_points" in name + ): + continue + self.assertIn( + ((param.data.mean() * 1e9).round() / 1e9).item(), + [0.0, 1.0], + msg=f"Parameter {name} of model {model_class} seems not properly initialized", + ) + + # Copied from tests.models.deformable_detr.test_modeling_deformable_detr.DeformableDetrModelTest.test_two_stage_training with DeformableDetr->GroundingDino + def test_two_stage_training(self): + model_class = GroundingDinoForObjectDetection + config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() + config.return_dict = True + config.two_stage = True + config.auxiliary_loss = True + config.with_box_refine = True + + model = model_class(config) + model.to(torch_device) + model.train() + inputs = self._prepare_for_class(inputs_dict, model_class, return_labels=True) + loss = model(**inputs).loss + loss.backward() + + def test_tied_weights_keys(self): + config, _ = 
self.model_tester.prepare_config_and_inputs_for_common()
+        config.tie_word_embeddings = True
+        for model_class in self.all_model_classes:
+            model_tied = model_class(config)
+
+            ptrs = collections.defaultdict(list)
+            for name, tensor in model_tied.state_dict().items():
+                ptrs[id_tensor_storage(tensor)].append(name)
+
+            # These are all the pointers of shared tensors.
+            tied_params = [names for _, names in ptrs.items() if len(names) > 1]
+
+            tied_weight_keys = model_tied._tied_weights_keys if model_tied._tied_weights_keys is not None else []
+            # Detect that we get a hit for each key
+            for key in tied_weight_keys:
+                if not any(re.search(key, p) for group in tied_params for p in group):
+                    raise ValueError(f"{key} is not a tied weight key for {model_class}.")
+
+            # Remove tied weights found in tied params -> there should only be one left after
+            for key in tied_weight_keys:
+                for i in range(len(tied_params)):
+                    tied_params[i] = [p for p in tied_params[i] if re.search(key, p) is None]
+
+            # When sharing weights, GroundingDino also uses the shared ones in GroundingDinoDecoder.
+            # Therefore, unlike DeformableDetr, we expect the group lengths to be 2:
+            # one for self.bbox_embed in GroundingDinoForObjectDetection and another one
+            # in the decoder
+            tied_params = [group for group in tied_params if len(group) > 2]
+            self.assertListEqual(
+                tied_params,
+                [],
+                f"Missing `_tied_weights_keys` for {model_class}: add all of {tied_params} except one.",
+            )
+
+
+# We will verify our results on an image of cute cats
+def prepare_img():
+    image = Image.open("./tests/fixtures/tests_samples/COCO/000000039769.png")
+    return image
+
+
+def prepare_text():
+    text = "a cat."
+    return text
+
+
+@require_timm
+@require_vision
+@slow
+class GroundingDinoModelIntegrationTests(unittest.TestCase):
+    @cached_property
+    def default_processor(self):
+        return AutoProcessor.from_pretrained("IDEA-Research/grounding-dino-tiny") if is_vision_available() else None
+
+    def test_inference_object_detection_head(self):
+        model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny").to(torch_device)
+
+        processor = self.default_processor
+        image = prepare_img()
+        text = prepare_text()
+        encoding = processor(images=image, text=text, return_tensors="pt").to(torch_device)
+
+        with torch.no_grad():
+            outputs = model(**encoding)
+
+        expected_shape_logits = torch.Size((1, model.config.num_queries, model.config.d_model))
+        self.assertEqual(outputs.logits.shape, expected_shape_logits)
+
+        expected_boxes = torch.tensor(
+            [[0.7674, 0.4136, 0.4572], [0.2566, 0.5463, 0.4760], [0.2585, 0.5442, 0.4641]]
+        ).to(torch_device)
+        expected_logits = torch.tensor(
+            [[-4.8913, -0.1900, -0.2161], [-4.9653, -0.3719, -0.3950], [-5.9599, -3.3765, -3.3104]]
+        ).to(torch_device)
+
+        self.assertTrue(torch.allclose(outputs.logits[0, :3, :3], expected_logits, atol=1e-3))
+
+        expected_shape_boxes = torch.Size((1, model.config.num_queries, 4))
+        self.assertEqual(outputs.pred_boxes.shape, expected_shape_boxes)
+        self.assertTrue(torch.allclose(outputs.pred_boxes[0, :3, :3], expected_boxes, atol=1e-4))
+
+        # verify postprocessing
+        results = processor.image_processor.post_process_object_detection(
+            outputs, threshold=0.35, target_sizes=[image.size[::-1]]
+        )[0]
+        expected_scores = torch.tensor([0.4526, 0.4082]).to(torch_device)
+        expected_slice_boxes = torch.tensor([344.8143, 23.1796, 637.4004, 373.8295]).to(torch_device)
+
+        self.assertEqual(len(results["scores"]), 2)
+        self.assertTrue(torch.allclose(results["scores"],
expected_scores, atol=1e-3)) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) + + # verify grounded postprocessing + expected_labels = ["a cat", "a cat"] + results = processor.post_process_grounded_object_detection( + outputs=outputs, + input_ids=encoding.input_ids, + box_threshold=0.35, + text_threshold=0.3, + target_sizes=[image.size[::-1]], + )[0] + + self.assertTrue(torch.allclose(results["scores"], expected_scores, atol=1e-3)) + self.assertTrue(torch.allclose(results["boxes"][0, :], expected_slice_boxes, atol=1e-2)) + self.assertListEqual(results["labels"], expected_labels) + + @require_torch_gpu + def test_inference_object_detection_head_equivalence_cpu_gpu(self): + processor = self.default_processor + image = prepare_img() + text = prepare_text() + encoding = processor(images=image, text=text, return_tensors="pt") + + # 1. run model on CPU + model = GroundingDinoForObjectDetection.from_pretrained("IDEA-Research/grounding-dino-tiny") + + with torch.no_grad(): + cpu_outputs = model(**encoding) + + # 2. run model on GPU + model.to("cuda") + encoding = encoding.to("cuda") + with torch.no_grad(): + gpu_outputs = model(**encoding) + + # 3. assert equivalence + for key in cpu_outputs.keys(): + self.assertTrue(torch.allclose(cpu_outputs[key], gpu_outputs[key].cpu(), atol=1e-3)) + + expected_logits = torch.tensor( + [[-4.8915, -0.1900, -0.2161], [-4.9658, -0.3716, -0.3948], [-5.9596, -3.3763, -3.3103]] + ) + self.assertTrue(torch.allclose(cpu_outputs.logits[0, :3, :3], expected_logits, atol=1e-3)) + + # assert postprocessing + results_cpu = processor.image_processor.post_process_object_detection( + cpu_outputs, threshold=0.35, target_sizes=[image.size[::-1]] + )[0] + + result_gpu = processor.image_processor.post_process_object_detection( + gpu_outputs, threshold=0.35, target_sizes=[image.size[::-1]] + )[0] + + self.assertTrue(torch.allclose(results_cpu["scores"], result_gpu["scores"].cpu(), atol=1e-3)) + self.assertTrue(torch.allclose(results_cpu["boxes"], result_gpu["boxes"].cpu(), atol=1e-3)) diff --git a/tests/models/grounding_dino/test_processor_grounding_dino.py b/tests/models/grounding_dino/test_processor_grounding_dino.py new file mode 100644 index 0000000000..a788d09ca7 --- /dev/null +++ b/tests/models/grounding_dino/test_processor_grounding_dino.py @@ -0,0 +1,253 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import json
+import os
+import shutil
+import tempfile
+import unittest
+
+import numpy as np
+import pytest
+
+from transformers import BertTokenizer, BertTokenizerFast, GroundingDinoProcessor
+from transformers.models.bert.tokenization_bert import VOCAB_FILES_NAMES
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import IMAGE_PROCESSOR_NAME, is_torch_available, is_vision_available
+
+
+if is_torch_available():
+    import torch
+
+    from transformers.models.grounding_dino.modeling_grounding_dino import GroundingDinoObjectDetectionOutput
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import GroundingDinoImageProcessor
+
+
+@require_torch
+@require_vision
+class GroundingDinoProcessorTest(unittest.TestCase):
+    def setUp(self):
+        self.tmpdirname = tempfile.mkdtemp()
+
+        vocab_tokens = ["[UNK]","[CLS]","[SEP]","[PAD]","[MASK]","want","##want","##ed","wa","un","runn","##ing",",","low","lowest"]  # fmt: skip
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+        image_processor_map = {
+            "do_resize": True,
+            "size": None,
+            "do_normalize": True,
+            "image_mean": [0.5, 0.5, 0.5],
+            "image_std": [0.5, 0.5, 0.5],
+            "do_rescale": True,
+            "rescale_factor": 1 / 255,
+            "do_pad": True,
+        }
+        self.image_processor_file = os.path.join(self.tmpdirname, IMAGE_PROCESSOR_NAME)
+        with open(self.image_processor_file, "w", encoding="utf-8") as fp:
+            json.dump(image_processor_map, fp)
+
+        self.batch_size = 7
+        self.num_queries = 5
+        self.embed_dim = 5
+        self.seq_length = 5
+
+    # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_tokenizer with CLIP->Bert
+    def get_tokenizer(self, **kwargs):
+        return BertTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_rust_tokenizer with CLIP->Bert
+    def get_rust_tokenizer(self, **kwargs):
+        return BertTokenizerFast.from_pretrained(self.tmpdirname, **kwargs)
+
+    # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.get_image_processor with CLIP->GroundingDino
+    def get_image_processor(self, **kwargs):
+        return GroundingDinoImageProcessor.from_pretrained(self.tmpdirname, **kwargs)
+
+    # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.tearDown
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.prepare_image_inputs
+    def prepare_image_inputs(self):
+        """This function prepares a list of small random PIL images to be used as image inputs in the tests.
+ """ + + image_inputs = [np.random.randint(255, size=(3, 30, 400), dtype=np.uint8)] + + image_inputs = [Image.fromarray(np.moveaxis(x, 0, -1)) for x in image_inputs] + + return image_inputs + + def get_fake_grounding_dino_output(self): + torch.manual_seed(42) + return GroundingDinoObjectDetectionOutput( + pred_boxes=torch.rand(self.batch_size, self.num_queries, 4), + logits=torch.rand(self.batch_size, self.num_queries, self.embed_dim), + ) + + def get_fake_grounding_dino_input_ids(self): + input_ids = torch.tensor([101, 1037, 4937, 1012, 102]) + return torch.stack([input_ids] * self.batch_size, dim=0) + + def test_post_process_grounded_object_detection(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + grounding_dino_output = self.get_fake_grounding_dino_output() + grounding_dino_input_ids = self.get_fake_grounding_dino_input_ids() + + post_processed = processor.post_process_grounded_object_detection( + grounding_dino_output, grounding_dino_input_ids + ) + + self.assertEqual(len(post_processed), self.batch_size) + self.assertEqual(list(post_processed[0].keys()), ["scores", "labels", "boxes"]) + self.assertEqual(post_processed[0]["boxes"].shape, (self.num_queries, 4)) + self.assertEqual(post_processed[0]["scores"].shape, (self.num_queries,)) + + expected_scores = torch.tensor([0.7050, 0.7222, 0.7222, 0.6829, 0.7220]) + self.assertTrue(torch.allclose(post_processed[0]["scores"], expected_scores, atol=1e-4)) + + expected_box_slice = torch.tensor([0.6908, 0.4354, 1.0737, 1.3947]) + self.assertTrue(torch.allclose(post_processed[0]["boxes"][0], expected_box_slice, atol=1e-4)) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_default with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer + def test_save_load_pretrained_default(self): + tokenizer_slow = self.get_tokenizer() + tokenizer_fast = self.get_rust_tokenizer() + image_processor = self.get_image_processor() + + processor_slow = GroundingDinoProcessor(tokenizer=tokenizer_slow, image_processor=image_processor) + processor_slow.save_pretrained(self.tmpdirname) + processor_slow = GroundingDinoProcessor.from_pretrained(self.tmpdirname, use_fast=False) + + processor_fast = GroundingDinoProcessor(tokenizer=tokenizer_fast, image_processor=image_processor) + processor_fast.save_pretrained(self.tmpdirname) + processor_fast = GroundingDinoProcessor.from_pretrained(self.tmpdirname) + + self.assertEqual(processor_slow.tokenizer.get_vocab(), tokenizer_slow.get_vocab()) + self.assertEqual(processor_fast.tokenizer.get_vocab(), tokenizer_fast.get_vocab()) + self.assertEqual(tokenizer_slow.get_vocab(), tokenizer_fast.get_vocab()) + self.assertIsInstance(processor_slow.tokenizer, BertTokenizer) + self.assertIsInstance(processor_fast.tokenizer, BertTokenizerFast) + + self.assertEqual(processor_slow.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertEqual(processor_fast.image_processor.to_json_string(), image_processor.to_json_string()) + self.assertIsInstance(processor_slow.image_processor, GroundingDinoImageProcessor) + self.assertIsInstance(processor_fast.image_processor, GroundingDinoImageProcessor) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_save_load_pretrained_additional_features with CLIP->GroundingDino,GroundingDinoTokenizer->BertTokenizer + def test_save_load_pretrained_additional_features(self): 
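+        # Kwargs passed to `from_pretrained` should be forwarded to the underlying tokenizer
+        # (bos_token/eos_token) and image processor (do_normalize, padding_value) on reload.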
+ processor = GroundingDinoProcessor(tokenizer=self.get_tokenizer(), image_processor=self.get_image_processor()) + processor.save_pretrained(self.tmpdirname) + + tokenizer_add_kwargs = self.get_tokenizer(bos_token="(BOS)", eos_token="(EOS)") + image_processor_add_kwargs = self.get_image_processor(do_normalize=False, padding_value=1.0) + + processor = GroundingDinoProcessor.from_pretrained( + self.tmpdirname, bos_token="(BOS)", eos_token="(EOS)", do_normalize=False, padding_value=1.0 + ) + + self.assertEqual(processor.tokenizer.get_vocab(), tokenizer_add_kwargs.get_vocab()) + self.assertIsInstance(processor.tokenizer, BertTokenizerFast) + + self.assertEqual(processor.image_processor.to_json_string(), image_processor_add_kwargs.to_json_string()) + self.assertIsInstance(processor.image_processor, GroundingDinoImageProcessor) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_image_processor with CLIP->GroundingDino + def test_image_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + image_input = self.prepare_image_inputs() + + input_image_proc = image_processor(image_input, return_tensors="np") + input_processor = processor(images=image_input, return_tensors="np") + + for key in input_image_proc.keys(): + self.assertAlmostEqual(input_image_proc[key].sum(), input_processor[key].sum(), delta=1e-2) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer with CLIP->GroundingDino + def test_tokenizer(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + + encoded_processor = processor(text=input_str) + + encoded_tok = tokenizer(input_str) + + for key in encoded_tok.keys(): + self.assertListEqual(encoded_tok[key], encoded_processor[key]) + + def test_processor(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual( + list(inputs.keys()), ["input_ids", "token_type_ids", "attention_mask", "pixel_values", "pixel_mask"] + ) + + # test if it raises when no input is passed + with pytest.raises(ValueError): + processor() + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_tokenizer_decode with CLIP->GroundingDino + def test_tokenizer_decode(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + predicted_ids = [[1, 4, 5, 8, 1, 0, 8], [3, 4, 3, 1, 1, 8, 9]] + + decoded_processor = processor.batch_decode(predicted_ids) + decoded_tok = tokenizer.batch_decode(predicted_ids) + + self.assertListEqual(decoded_tok, decoded_processor) + + # Copied from tests.models.clip.test_processor_clip.CLIPProcessorTest.test_model_input_names with CLIP->GroundingDino + def test_model_input_names(self): + image_processor = self.get_image_processor() + tokenizer = self.get_tokenizer() + + processor = GroundingDinoProcessor(tokenizer=tokenizer, image_processor=image_processor) + + input_str = "lower newer" + image_input = 
self.prepare_image_inputs() + + inputs = processor(text=input_str, images=image_input) + + self.assertListEqual(list(inputs.keys()), processor.model_input_names) From fbdb978eb5b42686f8ad858c13c8f6f7209b5c84 Mon Sep 17 00:00:00 2001 From: lewtun Date: Thu, 11 Apr 2024 10:38:24 +0200 Subject: [PATCH 081/110] Fix Llava chat template examples (#30130) --- docs/source/en/model_doc/llava.md | 4 ++-- src/transformers/models/llava/modeling_llava.py | 9 +++++---- tests/models/llava/test_modeling_llava.py | 12 ++++++------ 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/docs/source/en/model_doc/llava.md b/docs/source/en/model_doc/llava.md index ee7d9bbd1a..0ca6382714 100644 --- a/docs/source/en/model_doc/llava.md +++ b/docs/source/en/model_doc/llava.md @@ -43,13 +43,13 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/ - For better results, we recommend users to prompt the model with the correct prompt format: ```bash -"USER: \nASSISTANT:" +"USER: \n ASSISTANT:" ``` For multiple turns conversation: ```bash -"USER: \nASSISTANT: USER: ASSISTANT: USER: ASSISTANT:" +"USER: \n ASSISTANT: USER: ASSISTANT: USER: ASSISTANT:" ``` ### Using Flash Attention 2 diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index f195c1140b..4cf5d98f77 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" PyTorch Llava model.""" +"""PyTorch Llava model.""" + from dataclasses import dataclass from typing import List, Optional, Tuple, Union @@ -388,16 +389,16 @@ class LlavaForConditionalGeneration(LlavaPreTrainedModel): >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf") >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") - >>> prompt = "\nUSER: What's the content of the image?\nASSISTANT:" + >>> prompt = "USER: \nWhat's the content of the image? ASSISTANT:" >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> inputs = processor(text=prompt, images=image, return_tensors="pt") >>> # Generate - >>> generate_ids = model.generate(**inputs, max_length=30) + >>> generate_ids = model.generate(**inputs, max_new_tokens=15) >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "\nUSER: What's the content of the image?\nASSISTANT: The image features a stop sign on a street corner" + "USER: \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed" ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions diff --git a/tests/models/llava/test_modeling_llava.py b/tests/models/llava/test_modeling_llava.py index d6bb2b56ac..ce432e0599 100644 --- a/tests/models/llava/test_modeling_llava.py +++ b/tests/models/llava/test_modeling_llava.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Testing suite for the PyTorch Llava model. 
""" +"""Testing suite for the PyTorch Llava model.""" import copy import gc @@ -398,13 +398,13 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf", load_in_4bit=True) processor = AutoProcessor.from_pretrained(model_id) - prompt = "USER: \nWhat are the things I should be cautious about when I visit this place?\nASSISTANT:" + prompt = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT:" image_file = "https://llava-vl.github.io/static/images/view.jpg" raw_image = Image.open(requests.get(image_file, stream=True).raw) inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16) output = model.generate(**inputs, max_new_tokens=900, do_sample=False) - EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the presence of wildlife, such as birds or fish, and avoid disturbing their natural habitats. Lastly, be aware of any local regulations or guidelines for the use of the pier, as some areas may be restricted or prohibited for certain activities." # fmt: skip + EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip self.assertEqual( processor.decode(output[0], skip_special_tokens=True), @@ -421,8 +421,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): processor = AutoProcessor.from_pretrained(model_id) prompts = [ - "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me?\nASSISTANT:", - "USER: \nWhat is this?\nASSISTANT:", + "USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT:", + "USER: \nWhat is this? ASSISTANT:", ] image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw) image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) @@ -431,7 +431,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase): output = model.generate(**inputs, max_new_tokens=20) - EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? 
What should I bring with me?\nASSISTANT: When visiting this place, which appears to be a dock or pier extending over a body of water', 'USER: \nWhat is this?\nASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip + EXPECTED_DECODED_TEXT = ['USER: \nWhat are the things I should be cautious about when I visit this place? What should I bring with me? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, you', 'USER: \nWhat is this? ASSISTANT: The image features two cats lying down on a pink couch. One cat is located on'] # fmt: skip self.assertEqual(processor.batch_decode(output, skip_special_tokens=True), EXPECTED_DECODED_TEXT) From e50be9a05808d751dc0a1fe5169d0af7039ab04b Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Thu, 11 Apr 2024 04:49:16 -0400 Subject: [PATCH 082/110] Guard XLA version imports (#30167) --- src/transformers/trainer.py | 13 +++++++++++-- src/transformers/utils/__init__.py | 1 + src/transformers/utils/import_utils.py | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 292ecad383..5b8ffeafc7 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -136,6 +136,7 @@ from .utils import ( SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME, + XLA_FSDPV2_MIN_VERSION, PushInProgress, PushToHubMixin, can_return_loss, @@ -179,8 +180,14 @@ if is_datasets_available(): if is_torch_xla_available(): import torch_xla.core.xla_model as xm import torch_xla.debug.metrics as met - import torch_xla.distributed.spmd as xs - import torch_xla.runtime as xr + from torch_xla import __version__ as XLA_VERSION + + IS_XLA_FSDPV2_POST_2_2 = version.parse(XLA_VERSION) >= version.parse(XLA_FSDPV2_MIN_VERSION) + if IS_XLA_FSDPV2_POST_2_2: + import torch_xla.distributed.spmd as xs + import torch_xla.runtime as xr +else: + IS_XLA_FSDPV2_POST_2_2 = False if is_sagemaker_mp_enabled(): @@ -664,6 +671,8 @@ class Trainer: self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False) if self.is_fsdp_xla_v2_enabled: + if not IS_XLA_FSDPV2_POST_2_2: + raise ValueError("FSDPv2 requires `torch_xla` 2.2 or higher.") # Prepare the SPMD mesh that is going to be used by the data loader and the FSDPv2 wrapper. # Tensor axis is just a placeholder where it will not be used in FSDPv2. 
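+            # The version check above guarantees that `xr` and `xs` were imported (torch_xla >= 2.2),
+            # so the SPMD mesh can be built here unconditionally.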
num_devices = xr.global_runtime_device_count() diff --git a/src/transformers/utils/__init__.py b/src/transformers/utils/__init__.py index d33a673245..121c4dc136 100644 --- a/src/transformers/utils/__init__.py +++ b/src/transformers/utils/__init__.py @@ -98,6 +98,7 @@ from .import_utils import ( USE_JAX, USE_TF, USE_TORCH, + XLA_FSDPV2_MIN_VERSION, DummyObject, OptionalDependencyNotAvailable, _LazyModule, diff --git a/src/transformers/utils/import_utils.py b/src/transformers/utils/import_utils.py index 486df11185..a8c45aeac3 100644 --- a/src/transformers/utils/import_utils.py +++ b/src/transformers/utils/import_utils.py @@ -89,6 +89,7 @@ TORCH_FX_REQUIRED_VERSION = version.parse("1.10") ACCELERATE_MIN_VERSION = "0.21.0" FSDP_MIN_VERSION = "1.12.0" +XLA_FSDPV2_MIN_VERSION = "2.2.0" _accelerate_available, _accelerate_version = _is_package_available("accelerate", return_version=True) From 58b170cdb19cf97e1eabf9dfa34a03ea80fbcef9 Mon Sep 17 00:00:00 2001 From: hugehope <166521727+hugehope@users.noreply.github.com> Date: Thu, 11 Apr 2024 16:49:36 +0800 Subject: [PATCH 083/110] chore: remove repetitive words (#30174) Signed-off-by: hugehope --- src/transformers/models/canine/modeling_canine.py | 2 +- src/transformers/models/mamba/configuration_mamba.py | 2 +- src/transformers/models/rwkv/configuration_rwkv.py | 2 +- src/transformers/optimization.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/transformers/models/canine/modeling_canine.py b/src/transformers/models/canine/modeling_canine.py index 023287153a..39d89c6e0b 100644 --- a/src/transformers/models/canine/modeling_canine.py +++ b/src/transformers/models/canine/modeling_canine.py @@ -608,7 +608,7 @@ class CanineAttention(nn.Module): chunk_end = min(from_seq_length, chunk_start + self.attend_from_chunk_width) from_chunks.append((chunk_start, chunk_end)) - # Determine the chunks (windows) that will will attend *to*. + # Determine the chunks (windows) that will attend *to*. to_chunks = [] if self.first_position_attends_to_all: to_chunks.append((0, to_seq_length)) diff --git a/src/transformers/models/mamba/configuration_mamba.py b/src/transformers/models/mamba/configuration_mamba.py index 695d9a6273..b3e9b4eb94 100644 --- a/src/transformers/models/mamba/configuration_mamba.py +++ b/src/transformers/models/mamba/configuration_mamba.py @@ -67,7 +67,7 @@ class MambaConfig(PretrainedConfig): residual_in_fp32 (`bool`, *optional*, defaults to `True`): Whether or not residuals should be in `float32`. If set to `False` residuals will keep the same `dtype` as the rest of the model time_step_rank (`Union[int,str]`, *optional*, defaults to `"auto"`): - Rank of the the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` + Rank of the discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)` time_step_scale (`float`, *optional*, defaults to 1.0): Scale used used to scale `dt_proj.bias`. time_step_min (`float`, *optional*, defaults to 0.001): diff --git a/src/transformers/models/rwkv/configuration_rwkv.py b/src/transformers/models/rwkv/configuration_rwkv.py index a6abfc549e..5e0598dad5 100644 --- a/src/transformers/models/rwkv/configuration_rwkv.py +++ b/src/transformers/models/rwkv/configuration_rwkv.py @@ -41,7 +41,7 @@ class RwkvConfig(PretrainedConfig): Vocabulary size of the RWKV model. Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling [`RwkvModel`]. 
context_length (`int`, *optional*, defaults to 1024): - The maximum sequence length that this model can be be used with in a single forward (using it in RNN mode + The maximum sequence length that this model can be used with in a single forward (using it in RNN mode lets use any sequence length). hidden_size (`int`, *optional*, defaults to 4096): Dimensionality of the embeddings and hidden states. diff --git a/src/transformers/optimization.py b/src/transformers/optimization.py index ce9f9b78dc..3727784fba 100644 --- a/src/transformers/optimization.py +++ b/src/transformers/optimization.py @@ -273,7 +273,7 @@ def get_polynomial_decay_schedule_with_warmup( lr_init = optimizer.defaults["lr"] if not (lr_init > lr_end): - raise ValueError(f"lr_end ({lr_end}) must be be smaller than initial lr ({lr_init})") + raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})") lr_lambda = partial( _get_polynomial_decay_schedule_with_warmup_lr_lambda, From e516d1b19d035469b4852e34ba0356587e6f8ade Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Thu, 11 Apr 2024 17:17:10 +0530 Subject: [PATCH 084/110] fix: Fixed `ruff` configuration to avoid deprecated configuration warning (#30179) * Fixed deprecated ruff configuration in pyproject.toml file * reverted un-necessary changes. * small fix. --- pyproject.toml | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index d66b89769c..d709ba0a49 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,16 +1,18 @@ [tool.ruff] +line-length = 119 + +[tool.ruff.lint] # Never enforce `E501` (line length violations). ignore = ["C901", "E501", "E741", "F402", "F823" ] select = ["C", "E", "F", "I", "W"] -line-length = 119 # Ignore import violations in all `__init__.py` files. 
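+# Under ruff's current config layout, lint-specific settings (select/ignore, per-file-ignores, isort)
+# live under `[tool.ruff.lint]`, while formatter-wide options such as `line-length` stay in `[tool.ruff]`.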
-[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = ["E402", "F401", "F403", "F811"] "src/transformers/file_utils.py" = ["F401"] "src/transformers/utils/dummy_*.py" = ["F401"] -[tool.ruff.isort] +[tool.ruff.lint.isort] lines-after-imports = 2 known-first-party = ["transformers"] @@ -33,4 +35,4 @@ doctest_glob="**/*.md" markers = [ "flash_attn_test: marks tests related to flash attention (deselect with '-m \"not flash_attn_test\"')", "bitsandbytes: select (or deselect with `not`) bitsandbytes integration tests", -] \ No newline at end of file +] From 386ef34e7d35676c0f7ac3305b156e46a3255042 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Thu, 11 Apr 2024 14:24:38 +0200 Subject: [PATCH 085/110] [Processor classes] Update docs (#29698) Update docs --- src/transformers/models/align/processing_align.py | 3 +-- src/transformers/models/altclip/processing_altclip.py | 3 +-- .../models/chinese_clip/processing_chinese_clip.py | 3 +-- src/transformers/models/clip/processing_clip.py | 3 +-- src/transformers/models/clipseg/processing_clipseg.py | 3 +-- src/transformers/models/fuyu/processing_fuyu.py | 3 +-- src/transformers/models/git/processing_git.py | 3 +-- src/transformers/models/llava/processing_llava.py | 3 +-- src/transformers/models/oneformer/processing_oneformer.py | 3 +-- src/transformers/models/owlv2/processing_owlv2.py | 3 +-- src/transformers/models/owlvit/processing_owlvit.py | 3 +-- src/transformers/models/siglip/processing_siglip.py | 3 +-- .../processing_vision_text_dual_encoder.py | 3 +-- 13 files changed, 13 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py index 0863c11310..8bcea7eb5d 100644 --- a/src/transformers/models/align/processing_align.py +++ b/src/transformers/models/align/processing_align.py @@ -57,8 +57,7 @@ class AlignProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`): Activates and controls padding for tokenization of input text. Choose between [`True` or `'longest'`, `'max_length'`, `False` or `'do_not_pad'`] diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py index e9b4f45269..9518c55d40 100644 --- a/src/transformers/models/altclip/processing_altclip.py +++ b/src/transformers/models/altclip/processing_altclip.py @@ -73,8 +73,7 @@ class AltCLIPProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. 
Both channels-first and channels-last formats are supported. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: diff --git a/src/transformers/models/chinese_clip/processing_chinese_clip.py b/src/transformers/models/chinese_clip/processing_chinese_clip.py index 832f44102a..1f44fc50ae 100644 --- a/src/transformers/models/chinese_clip/processing_chinese_clip.py +++ b/src/transformers/models/chinese_clip/processing_chinese_clip.py @@ -75,8 +75,7 @@ class ChineseCLIPProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 31351f31ef..33fe25a21e 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -73,8 +73,7 @@ class CLIPProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: diff --git a/src/transformers/models/clipseg/processing_clipseg.py b/src/transformers/models/clipseg/processing_clipseg.py index e57021f213..f8eaca8233 100644 --- a/src/transformers/models/clipseg/processing_clipseg.py +++ b/src/transformers/models/clipseg/processing_clipseg.py @@ -73,8 +73,7 @@ class CLIPSegProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. visual_prompt (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The visual prompt image or batch of images to be prepared. Each visual prompt image can be a PIL image, NumPy array or PyTorch tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape diff --git a/src/transformers/models/fuyu/processing_fuyu.py b/src/transformers/models/fuyu/processing_fuyu.py index f7078554cb..ffa215f1a0 100644 --- a/src/transformers/models/fuyu/processing_fuyu.py +++ b/src/transformers/models/fuyu/processing_fuyu.py @@ -482,8 +482,7 @@ class FuyuProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `List[PIL.Image.Image]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. Returns: [`FuyuBatchEncoding`]: A [`FuyuBatchEncoding`] with the following fields: diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py index 2f0851c062..79f26f3bf2 100644 --- a/src/transformers/models/git/processing_git.py +++ b/src/transformers/models/git/processing_git.py @@ -57,8 +57,7 @@ class GitProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 1ba1b30e65..62a46acd39 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -70,8 +70,7 @@ class LlavaProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: diff --git a/src/transformers/models/oneformer/processing_oneformer.py b/src/transformers/models/oneformer/processing_oneformer.py index dc20f48f68..9e55be5d67 100644 --- a/src/transformers/models/oneformer/processing_oneformer.py +++ b/src/transformers/models/oneformer/processing_oneformer.py @@ -91,8 +91,7 @@ class OneFormerProcessor(ProcessorMixin): images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. 
In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. segmentation_maps (`ImageInput`, *optional*): The corresponding semantic segmentation maps with the pixel-wise annotations. diff --git a/src/transformers/models/owlv2/processing_owlv2.py b/src/transformers/models/owlv2/processing_owlv2.py index 77493f6cb2..8b580ca502 100644 --- a/src/transformers/models/owlv2/processing_owlv2.py +++ b/src/transformers/models/owlv2/processing_owlv2.py @@ -62,8 +62,7 @@ class Owlv2Processor(ProcessorMixin): images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image diff --git a/src/transformers/models/owlvit/processing_owlvit.py b/src/transformers/models/owlvit/processing_owlvit.py index 670f7206fd..2c7d490104 100644 --- a/src/transformers/models/owlvit/processing_owlvit.py +++ b/src/transformers/models/owlvit/processing_owlvit.py @@ -77,8 +77,7 @@ class OwlViTProcessor(ProcessorMixin): images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. query_images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The query image to be prepared, one query image is expected per target image to be queried. Each image can be a PIL image, NumPy array or PyTorch tensor. In case of a NumPy array/PyTorch tensor, each image diff --git a/src/transformers/models/siglip/processing_siglip.py b/src/transformers/models/siglip/processing_siglip.py index f21cf73548..655fb4d4f7 100644 --- a/src/transformers/models/siglip/processing_siglip.py +++ b/src/transformers/models/siglip/processing_siglip.py @@ -69,8 +69,7 @@ class SiglipProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. 
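The rewritten `images` docstring above describes behavior the image processors already implement: the channel dimension is inferred from the array shape. A minimal sketch, assuming the `openai/clip-vit-base-patch32` checkpoint is reachable:

```python
import numpy as np

from transformers import CLIPProcessor

processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

chw = np.random.randint(0, 256, (3, 224, 224), dtype=np.uint8)  # channels-first
hwc = chw.transpose(1, 2, 0)                                    # channels-last

# Both layouts are accepted and normalized to the same output shape.
print(processor(images=chw, return_tensors="pt").pixel_values.shape)  # (1, 3, 224, 224)
print(processor(images=hwc, return_tensors="pt").pixel_values.shape)  # (1, 3, 224, 224)
```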
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): Select a strategy to pad the returned sequences (according to the model's padding side and padding index) among: diff --git a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py index 322c13aadc..0d723ed10b 100644 --- a/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py +++ b/src/transformers/models/vision_text_dual_encoder/processing_vision_text_dual_encoder.py @@ -76,8 +76,7 @@ class VisionTextDualEncoderProcessor(ProcessorMixin): `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch - tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a - number of channels, H and W are image height and width. + tensor. Both channels-first and channels-last formats are supported. return_tensors (`str` or [`~utils.TensorType`], *optional*): If set, will return tensors of a particular framework. Acceptable values are: From 5569552cf8779c8951326b2fa9b7a1d64b1005c9 Mon Sep 17 00:00:00 2001 From: NielsRogge <48327001+NielsRogge@users.noreply.github.com> Date: Thu, 11 Apr 2024 14:59:30 +0200 Subject: [PATCH 086/110] Update output of SuperPointForKeypointDetection (#29809) * Remove auto class * Update ImagePointDescriptionOutput * Update model outputs * Rename output class * Revert "Remove auto class" This reverts commit ed4a8f549d79cdb0cdf7aa74205a185c41471519. * Address comments --- .../models/superpoint/modeling_superpoint.py | 29 +++++++------------ .../superpoint/test_modeling_superpoint.py | 20 ++++++++----- 2 files changed, 23 insertions(+), 26 deletions(-) diff --git a/src/transformers/models/superpoint/modeling_superpoint.py b/src/transformers/models/superpoint/modeling_superpoint.py index a4350e6d79..3e3fdbbf10 100644 --- a/src/transformers/models/superpoint/modeling_superpoint.py +++ b/src/transformers/models/superpoint/modeling_superpoint.py @@ -79,7 +79,7 @@ def simple_nms(scores: torch.Tensor, nms_radius: int) -> torch.Tensor: @dataclass -class ImagePointDescriptionOutput(ModelOutput): +class SuperPointKeypointDescriptionOutput(ModelOutput): """ Base class for outputs of image point description models. Due to the nature of keypoint detection, the number of keypoints is not fixed and can vary from image to image, which makes batching non-trivial. In the batch of images, @@ -88,8 +88,8 @@ class ImagePointDescriptionOutput(ModelOutput): and which are padding. Args: - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the decoder of the model. + loss (`torch.FloatTensor` of shape `(1,)`, *optional*): + Loss computed during training. keypoints (`torch.FloatTensor` of shape `(batch_size, num_keypoints, 2)`): Relative (x, y) coordinates of predicted keypoints in a given image. scores (`torch.FloatTensor` of shape `(batch_size, num_keypoints)`): @@ -105,7 +105,7 @@ class ImagePointDescriptionOutput(ModelOutput): (also called feature maps) of the model at the output of each stage. 
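With `last_hidden_state` removed from the renamed output class, the same tensor is now reached through `hidden_states`. A minimal sketch of the updated interface; the checkpoint id and the random input are illustrative assumptions:

```python
import torch

from transformers import SuperPointForKeypointDetection

# Checkpoint id is an assumption -- substitute the SuperPoint weights you use.
model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")

pixel_values = torch.rand(1, 3, 480, 640)  # dummy input, illustrative only
with torch.no_grad():
    outputs = model(pixel_values, output_hidden_states=True)

# keypoints: (batch, num_keypoints, 2); the former last_hidden_state is now
# the final entry of hidden_states.
print(outputs.keypoints.shape, outputs.hidden_states[-1].shape)
```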
""" - last_hidden_state: torch.FloatTensor = None + loss: Optional[torch.FloatTensor] = None keypoints: Optional[torch.IntTensor] = None scores: Optional[torch.FloatTensor] = None descriptors: Optional[torch.FloatTensor] = None @@ -414,11 +414,11 @@ class SuperPointForKeypointDetection(SuperPointPreTrainedModel): @add_start_docstrings_to_model_forward(SUPERPOINT_INPUTS_DOCSTRING) def forward( self, - pixel_values: torch.FloatTensor = None, + pixel_values: torch.FloatTensor, labels: Optional[torch.LongTensor] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, - ) -> Union[Tuple, ImagePointDescriptionOutput]: + ) -> Union[Tuple, SuperPointKeypointDescriptionOutput]: """ Examples: @@ -437,20 +437,15 @@ class SuperPointForKeypointDetection(SuperPointPreTrainedModel): >>> inputs = processor(image, return_tensors="pt") >>> outputs = model(**inputs) ```""" - + loss = None if labels is not None: - raise ValueError( - f"SuperPoint is not trainable, no labels should be provided.Therefore, labels should be None but were {type(labels)}" - ) + raise ValueError("SuperPoint does not support training for now.") output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict - if pixel_values is None: - raise ValueError("You have to specify pixel_values") - pixel_values = self.extract_one_channel_pixel_values(pixel_values) batch_size = pixel_values.shape[0] @@ -493,12 +488,10 @@ class SuperPointForKeypointDetection(SuperPointPreTrainedModel): hidden_states = encoder_outputs[1] if output_hidden_states else None if not return_dict: - return tuple( - v for v in [last_hidden_state, keypoints, scores, descriptors, mask, hidden_states] if v is not None - ) + return tuple(v for v in [loss, keypoints, scores, descriptors, mask, hidden_states] if v is not None) - return ImagePointDescriptionOutput( - last_hidden_state=last_hidden_state, + return SuperPointKeypointDescriptionOutput( + loss=loss, keypoints=keypoints, scores=scores, descriptors=descriptors, diff --git a/tests/models/superpoint/test_modeling_superpoint.py b/tests/models/superpoint/test_modeling_superpoint.py index cb204d3f89..080eda385b 100644 --- a/tests/models/superpoint/test_modeling_superpoint.py +++ b/tests/models/superpoint/test_modeling_superpoint.py @@ -85,13 +85,17 @@ class SuperPointModelTester: border_removal_distance=self.border_removal_distance, ) - def create_and_check_model(self, config, pixel_values): + def create_and_check_keypoint_detection(self, config, pixel_values): model = SuperPointForKeypointDetection(config=config) model.to(torch_device) model.eval() result = model(pixel_values) + self.parent.assertEqual(result.keypoints.shape[0], self.batch_size) + self.parent.assertEqual(result.keypoints.shape[-1], 2) + + result = model(pixel_values, output_hidden_states=True) self.parent.assertEqual( - result.last_hidden_state.shape, + result.hidden_states[-1].shape, ( self.batch_size, self.encoder_hidden_sizes[-1], @@ -146,19 +150,19 @@ class SuperPointModelTest(ModelTesterMixin, unittest.TestCase): def test_feed_forward_chunking(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection is not trainable") + @unittest.skip(reason="SuperPointForKeypointDetection does not support training") def test_training(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection is not trainable") + @unittest.skip(reason="SuperPointForKeypointDetection does 
not support training") def test_training_gradient_checkpointing(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection is not trainable") + @unittest.skip(reason="SuperPointForKeypointDetection does not support training") def test_training_gradient_checkpointing_use_reentrant(self): pass - @unittest.skip(reason="SuperPointForKeypointDetection is not trainable") + @unittest.skip(reason="SuperPointForKeypointDetection does not support training") def test_training_gradient_checkpointing_use_reentrant_false(self): pass @@ -166,9 +170,9 @@ class SuperPointModelTest(ModelTesterMixin, unittest.TestCase): def test_retain_grad_hidden_states_attentions(self): pass - def test_model(self): + def test_keypoint_detection(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() - self.model_tester.create_and_check_model(*config_and_inputs) + self.model_tester.create_and_check_keypoint_detection(*config_and_inputs) def test_forward_signature(self): config, _ = self.model_tester.prepare_config_and_inputs() From edf0935dca9189db599ac6c3f3ef714160acbbd8 Mon Sep 17 00:00:00 2001 From: Sam Shleifer Date: Thu, 11 Apr 2024 06:04:46 -0700 Subject: [PATCH 087/110] Falcon: make activation, ffn_hidden_size configurable (#30134) * Falcon chg * delta * Docstring * Fix import block * doc * fix and overwrite --- .../models/falcon/configuration_falcon.py | 16 ++++++++++++++-- .../models/falcon/modeling_falcon.py | 7 ++++--- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/falcon/configuration_falcon.py b/src/transformers/models/falcon/configuration_falcon.py index 23ed4acb5e..61d202b096 100644 --- a/src/transformers/models/falcon/configuration_falcon.py +++ b/src/transformers/models/falcon/configuration_falcon.py @@ -12,7 +12,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -""" Falcon configuration""" +"""Falcon configuration""" + from ...configuration_utils import PretrainedConfig from ...utils import logging @@ -87,6 +88,11 @@ class FalconConfig(PretrainedConfig): The id of the "beginning-of-sequence" token. eos_token_id (`int`, *optional*, defaults to 11): The id of the "end-of-sequence" token. + ffn_hidden_size (`int`, *optional*): + The hidden size of the feedforward layer in the Transformer decoder. + defaults to 4x hidden dim + activation (`str`, *optional*, defaults to `"gelu"`): + The activation function used in the feedforward layer. 
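The two new Falcon knobs compose as follows; a minimal sketch with a deliberately tiny configuration:

```python
from transformers import FalconConfig, FalconForCausalLM

# ffn_hidden_size defaults to 4 * hidden_size when left unset; activation
# defaults to "gelu". Both are overridden here for illustration.
config = FalconConfig(
    hidden_size=256,
    num_hidden_layers=2,
    num_attention_heads=4,
    ffn_hidden_size=512,
    activation="relu",
)
model = FalconForCausalLM(config)
print(model.transformer.h[0].mlp.dense_h_to_4h.out_features)  # 512
```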
Example: @@ -128,6 +134,8 @@ class FalconConfig(PretrainedConfig): rope_scaling=None, bos_token_id=11, eos_token_id=11, + ffn_hidden_size=None, + activation="gelu", **kwargs, ): self.vocab_size = vocab_size @@ -141,7 +149,6 @@ class FalconConfig(PretrainedConfig): self.use_cache = use_cache self.hidden_dropout = hidden_dropout self.attention_dropout = attention_dropout - self.bos_token_id = bos_token_id self.eos_token_id = eos_token_id self.num_kv_heads = num_attention_heads if num_kv_heads is None else num_kv_heads @@ -153,6 +160,11 @@ class FalconConfig(PretrainedConfig): self.max_position_embeddings = max_position_embeddings self.rope_theta = rope_theta self.rope_scaling = rope_scaling + self.activation = activation + if ffn_hidden_size is None: + self.ffn_hidden_size = hidden_size * 4 + else: + self.ffn_hidden_size = ffn_hidden_size self._rope_scaling_validation() super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) diff --git a/src/transformers/models/falcon/modeling_falcon.py b/src/transformers/models/falcon/modeling_falcon.py index c2f3f91d5c..d9254bec0a 100644 --- a/src/transformers/models/falcon/modeling_falcon.py +++ b/src/transformers/models/falcon/modeling_falcon.py @@ -24,6 +24,7 @@ from torch import nn from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss from torch.nn import functional as F +from ...activations import get_activation from ...modeling_attn_mask_utils import ( AttentionMaskConverter, _prepare_4d_causal_attention_mask, @@ -739,9 +740,9 @@ class FalconMLP(nn.Module): super().__init__() hidden_size = config.hidden_size - self.dense_h_to_4h = FalconLinear(hidden_size, 4 * hidden_size, bias=config.bias) - self.act = nn.GELU() - self.dense_4h_to_h = FalconLinear(4 * hidden_size, hidden_size, bias=config.bias) + self.dense_h_to_4h = FalconLinear(hidden_size, config.ffn_hidden_size, bias=config.bias) + self.act = get_activation(config.activation) + self.dense_4h_to_h = FalconLinear(config.ffn_hidden_size, hidden_size, bias=config.bias) self.hidden_dropout = config.hidden_dropout def forward(self, x: torch.Tensor) -> torch.Tensor: From 0bd58f1ce0573c0e3269de4215a17d318add49b9 Mon Sep 17 00:00:00 2001 From: Steven Liu <59462357+stevhliu@users.noreply.github.com> Date: Thu, 11 Apr 2024 09:23:55 -0700 Subject: [PATCH 088/110] Docs PR template (#30171) remove maria :( --- .github/ISSUE_TEMPLATE/bug-report.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 1ec76462ac..ff47109690 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -46,7 +46,7 @@ body: - Big Model Inference: @SunMarc - quantization (bitsandbytes, autogpt): @SunMarc and @younesbelkada - Documentation: @stevhliu and @MKhalusova + Documentation: @stevhliu Model hub: From 2c66600c3fbbdcfc1610c5862c33bd19cf42859e Mon Sep 17 00:00:00 2001 From: Younes Belkada <49240599+younesbelkada@users.noreply.github.com> Date: Fri, 12 Apr 2024 10:01:28 +0200 Subject: [PATCH 089/110] ENH: [`CI`] Add new workflow to run slow tests of important models on push main if they are modified (#29235) * v1 * v1 * more changes * more models * add more markers * swtich to A10 * use cache * Update .github/workflows/push-important-models.yml * Update .github/workflows/push-important-models.yml * Update modeling_llama.py * test * test * another test * test * test * attempt to fix * fix * try automatic tagging * fix * alternative approach for 
collecting * fix * fix * fix * test * fix * fix * test * revert some changes * fix * fix * fix * final push * fix * revert * test new slack message * oops * Update send-slack.yml * test * test re-usable workflow in steps * Update action.yml * test * another test * test * another test * test * another test * another test (hopefully last one) * attempt to fix * allez * removing comma * test * another test * attempt * test * test * test push * test * test * another test * test * make it better * fix commas * valid json * test * another test * test * final push * test * final push * more customizable messages * test * push * oops * another test * another test * missing indentation * more tweaks * more tweaks * another test * another test * tests * final push * use global variables instead * Update .github/workflows/push-important-models.yml * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * commit to test all models * issue with arrays * another test * attempt to fix failing tests * Update .github/workflows/push-important-models.yml * add ssh * Update .github/workflows/push-important-models.yml * test * test * add install curl * attempt to fix * final fix * test * test * test * fix test * another test * add inherit secrets * push * revert unneeded changes * revert * add env variables * add pip freeze * revert change in gemma * Update .github/workflows/push-important-models.yml * fix mistral and mixtral * add pdb * fix mixtral tesst * fix * fix mistral ? * add fix gemma * fix mistral * fix * test * anoter test * fix * fix * fix mistral tests * fix them again * final fixes for mistral * fix padding right * fix whipser fa2 * fix * fix * fix gemma * test * fix llama * fix * fix * fix llama gemma * add class attribute * fix CI * clarify whisper * compute_capability * rename names in some comments * Add # fmt: skip * make style * Update tests/models/mistral/test_modeling_mistral.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * update * update * change branch * correct workflow * modify file * test * works * final test * another fix * install sudo * final fix * add `-y` * set to `main` * Update .github/actions/post-slack/action.yml Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * change title * fixup * add upload report * fix * revert to main * add empty lines + add comment --------- Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> Co-authored-by: ydshieh Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> --- .github/actions/post-slack/action.yml | 79 ++++++++++++ .github/workflows/push-important-models.yml | 136 ++++++++++++++++++++ utils/important_models.txt | 4 + 3 files changed, 219 insertions(+) create mode 100644 .github/actions/post-slack/action.yml create mode 100644 .github/workflows/push-important-models.yml create mode 100644 utils/important_models.txt diff --git a/.github/actions/post-slack/action.yml b/.github/actions/post-slack/action.yml new file mode 100644 index 0000000000..74075a4fed --- /dev/null +++ b/.github/actions/post-slack/action.yml @@ -0,0 +1,79 @@ +name: Send message to slack + +description: 'Send results to slack' +author: 'Hugging Face' +inputs: + slack_channel: + required: true + type: string + title: + required: true + type: string + status: + required: true + type: string + slack_token: + required: true + type: string + +runs: + using: 
"composite" + steps: + - name: Create content to post + id: create-message + run: | + if [ "${{ inputs.status }}" == "success" ]; then + echo STATUS_MESSAGE='🟢 Tests are passing!' >> $GITHUB_ENV + else + echo STATUS_MESSAGE='🔴 Tests failed! Please check the GitHub action link below' >> $GITHUB_ENV + fi + shell: bash + + - name: Post Canceled results Slack channel + id: post-slack + uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 + with: + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + channel-id: ${{ inputs.slack_channel }} + # For posting a rich message using Block Kit + payload: | + { + "text": "${{ inputs.title }}", + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "${{ inputs.title }}" + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "${{ env.STATUS_MESSAGE }}" + } + }, + { + "type": "section", + "text": {"type": "mrkdwn", "text": "*Click the button for more details about the commit*"}, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check Commit results"}, + "url": "${{ github.event.pull_request.html_url || github.event.head_commit.url }}" + } + }, + { + "type": "section", + "text": {"type": "mrkdwn", "text": "*Click here for more details about the action ran*"}, + "accessory": { + "type": "button", + "text": {"type": "plain_text", "text": "Check Action results"}, + "url": "${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" + } + } + ] + } + env: + SLACK_BOT_TOKEN: ${{ inputs.slack_token }} \ No newline at end of file diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml new file mode 100644 index 0000000000..5eef6f40f8 --- /dev/null +++ b/.github/workflows/push-important-models.yml @@ -0,0 +1,136 @@ +name: Slow tests on important models (on Push - A10) + +on: + push: + branches: [ main ] + +env: + IS_GITHUB_CI: "1" + OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA" + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. 
+ SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + RUN_PT_TF_CROSS_TESTS: 1 + +jobs: + get_modified_models: + name: "Get all modified files" + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - name: Check out code + uses: actions/checkout@v3 + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@3f54ebb830831fc121d3263c1857cfbdc310cdb9 #v42 + with: + files: src/transformers/models/** + + - name: Run step if only the files listed above change + if: steps.changed-files.outputs.any_changed == 'true' + id: set-matrix + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + model_arrays=() + for file in $ALL_CHANGED_FILES; do + model_path="${file#*models/}" + model_path="models/${model_path%%/*}" + if grep -qFx "$model_path" utils/important_models.txt; then + # Append the file to the matrix string + model_arrays+=("$model_path") + fi + done + matrix_string=$(printf '"%s", ' "${model_arrays[@]}" | sed 's/, $//') + echo "matrix=[$matrix_string]" >> $GITHUB_OUTPUT + test_modified_files: + needs: get_modified_models + name: Slow & FA2 tests + runs-on: [single-gpu, nvidia-gpu, a10, ci] + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' }} + strategy: + fail-fast: false + matrix: + model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }} + + steps: + - name: Check out code + uses: actions/checkout@v3 + + - name: Install locally transformers & other libs + run: | + apt install sudo + sudo -H pip install --upgrade pip + sudo -H pip uninstall -y transformers + sudo -H pip install -U -e ".[testing]" + MAX_JOBS=4 pip install flash-attn --no-build-isolation + pip install bitsandbytes + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Show installed libraries and their versions + run: pip freeze + + - name: Run FA2 tests + id: run_fa2_tests + run: + pytest -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_* + + - name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: ${{ matrix.model-name }}_fa2_tests + path: /transformers/reports/${{ matrix.model-name }}_fa2_tests + + - name: Post to Slack + if: always() + uses: ./.github/actions/post-slack + with: + slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }} + title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }} + status: ${{ steps.run_fa2_tests.conclusion}} + slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }} + + - name: Run integration tests + id: run_integration_tests + if: always() + run: + pytest -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_* + + - name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: tests_integration_${{ matrix.model-name }} + path: /transformers/reports/tests_integration_${{ matrix.model-name }} + + - name: Post to Slack + if: always() + uses: ./.github/actions/post-slack + with: + slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }} + title: 🤗 Results of the Integration tests - ${{ matrix.model-name }} + status: ${{ 
steps.run_integration_tests.conclusion}} + slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }} + + - name: Tailscale # In order to be able to SSH when a test fails + if: ${{ failure() || runner.debug == '1'}} + uses: huggingface/tailscale-action@ssh-improvments + with: + authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} + slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} + slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + waitForSSH: true \ No newline at end of file diff --git a/utils/important_models.txt b/utils/important_models.txt new file mode 100644 index 0000000000..6dcd5de8d6 --- /dev/null +++ b/utils/important_models.txt @@ -0,0 +1,4 @@ +models/llama +models/mistral +models/mixtral +models/gemma \ No newline at end of file From c82b38a3e2c8af69fef08e371189766a87ed6159 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Fri, 12 Apr 2024 09:34:45 +0100 Subject: [PATCH 090/110] Fix pipeline logger.warning_once bug (#30195) Fix warning bug --- src/transformers/pipelines/base.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/pipelines/base.py b/src/transformers/pipelines/base.py index 7225a6136e..35ee02cab7 100644 --- a/src/transformers/pipelines/base.py +++ b/src/transformers/pipelines/base.py @@ -1181,7 +1181,6 @@ class Pipeline(_ScikitCompat): logger.warning_once( "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a" " dataset", - UserWarning, ) is_dataset = Dataset is not None and isinstance(inputs, Dataset) From caa5c65db1f4db617cdac2ad667ba62edf94dd98 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Fri, 12 Apr 2024 14:51:24 +0530 Subject: [PATCH 091/110] fix: Replaced deprecated `logger.warn` with `logger.warning` (#30197) * Fixed deprecated logger.warn by using logger.warning * Reformatted using ruff. --- src/transformers/modeling_utils.py | 4 ++-- .../tokenization_fastspeech2_conformer.py | 4 ++-- src/transformers/models/whisper/generation_whisper.py | 10 ++++++---- src/transformers/quantizers/quantizer_aqlm.py | 2 +- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py index 8a107694a1..e4fcd2ebc1 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py @@ -2225,7 +2225,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) else: self.apply(partial(self._set_gradient_checkpointing, value=True)) - logger.warn( + logger.warning( "You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)." "Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model." ) @@ -2273,7 +2273,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix if not _is_using_old_format: self._set_gradient_checkpointing(enable=False) else: - logger.warn( + logger.warning( "You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)." "Please update to the new format on your modeling file. 
To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model." ) diff --git a/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py b/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py index bc52006ad6..5b979c8761 100644 --- a/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +++ b/src/transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py @@ -130,14 +130,14 @@ class FastSpeech2ConformerTokenizer(PreTrainedTokenizer): # Override since phonemes cannot be converted back to strings def decode(self, token_ids, **kwargs): - logger.warn( + logger.warning( "Phonemes cannot be reliably converted to a string due to the one-many mapping, converting to tokens instead." ) return self.convert_ids_to_tokens(token_ids) # Override since phonemes cannot be converted back to strings def convert_tokens_to_string(self, tokens, **kwargs): - logger.warn( + logger.warning( "Phonemes cannot be reliably converted to a string due to the one-many mapping, returning the tokens." ) return tokens diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index bd88b67bc6..090e608e02 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -985,16 +985,18 @@ class WhisperGenerationMixin: "{}, but will be ignored." ) if condition_on_prev_tokens is not None: - logger.warn(warning_prefix.format(f"condition_on_prev_tokens is set to {condition_on_prev_tokens}")) + logger.warning(warning_prefix.format(f"condition_on_prev_tokens is set to {condition_on_prev_tokens}")) if compression_ratio_threshold is not None: - logger.warn(warning_prefix.format(f"compression_ratio_threshold is set to {compression_ratio_threshold}")) + logger.warning( + warning_prefix.format(f"compression_ratio_threshold is set to {compression_ratio_threshold}") + ) if logprob_threshold is not None: - logger.warn(warning_prefix.format(f"logprob_threshold is set to {logprob_threshold}")) + logger.warning(warning_prefix.format(f"logprob_threshold is set to {logprob_threshold}")) if no_speech_threshold is not None: - logger.warn(warning_prefix.format(f"no_speech_threshold is set to {no_speech_threshold}")) + logger.warning(warning_prefix.format(f"no_speech_threshold is set to {no_speech_threshold}")) # when passing temperature as a list it cannot just be ignored => throw error in this case if isinstance(temperature, (list, tuple)): diff --git a/src/transformers/quantizers/quantizer_aqlm.py b/src/transformers/quantizers/quantizer_aqlm.py index c2366305b6..5300716161 100644 --- a/src/transformers/quantizers/quantizer_aqlm.py +++ b/src/transformers/quantizers/quantizer_aqlm.py @@ -88,7 +88,7 @@ class AqlmHfQuantizer(HfQuantizer): if aqlm_supports_training: return True else: - logger.warn( + logger.warning( f"Currently installed `aqlm` version ({importlib.metadata.version('aqlm')}) doesn't support training. 
If you wish to train a quantized model, please update `aqlm` with `pip install aqlm>=1.0.2`" ) return False From ac33aeeeee2a7a89b89c93c2962e6feb90daef0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C3=ABl=20Defferrard?= Date: Fri, 12 Apr 2024 16:41:35 +0200 Subject: [PATCH 092/110] fix typo (#30220) --- src/transformers/trainer_callback.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index 1e3b0e587a..f5bbcdbd42 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -196,7 +196,7 @@ class TrainerCallback: train_dataloader (`torch.utils.data.DataLoader`, *optional*): The current dataloader used for training. eval_dataloader (`torch.utils.data.DataLoader`, *optional*): - The current dataloader used for training. + The current dataloader used for evaluation. metrics (`Dict[str, float]`): The metrics computed by the last evaluation phase. From 65657d5d8adbd61fc44f3ca7e626096cb6931863 Mon Sep 17 00:00:00 2001 From: Pablo Montalvo <39954772+molbap@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:45:15 +0200 Subject: [PATCH 093/110] fix fuyu doctest (#30215) * fix doctest * fix example * fix * fix --------- Co-authored-by: ydshieh --- src/transformers/models/fuyu/modeling_fuyu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/transformers/models/fuyu/modeling_fuyu.py b/src/transformers/models/fuyu/modeling_fuyu.py index f94bac569f..8e9a41954a 100644 --- a/src/transformers/models/fuyu/modeling_fuyu.py +++ b/src/transformers/models/fuyu/modeling_fuyu.py @@ -242,17 +242,17 @@ class FuyuForCausalLM(FuyuPreTrainedModel): >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b") >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b") - >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png" >>> image = Image.open(requests.get(url, stream=True).raw) >>> prompt = "Generate a coco-style caption.\n" >>> inputs = processor(text=prompt, images=image, return_tensors="pt") >>> outputs = model(**inputs) - >>> generated_ids = model.generate(**model_inputs, max_new_tokens=7) - >>> generation_text = processor.batch_decode(generated_ids, skip_special_tokens=True) - >>> print(generation_text) - 'A bus parked on the side of a road.' + >>> generated_ids = model.generate(**inputs, max_new_tokens=7) + >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True) + >>> print(generation_text[0]) + A blue bus parked on the side of a road. 
```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions From bf9a7ab9321fc606db8a2d5134d1856a5798a611 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 12 Apr 2024 17:53:25 +0200 Subject: [PATCH 094/110] Fix `RecurrentGemmaIntegrationTest.test_2b_sample` (#30222) fix Co-authored-by: ydshieh --- tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py index ae1d9e7079..c46718b680 100644 --- a/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py +++ b/tests/models/recurrent_gemma/test_modeling_recurrent_gemma.py @@ -462,7 +462,7 @@ class RecurrentGemmaIntegrationTest(unittest.TestCase): @require_read_token def test_2b_sample(self): set_seed(0) - EXPECTED_TEXT = ['Where is Paris ?\n\nChoose the word or phrase that is closest in meaning to the word in capital letters.\n\nREDEEM\n(A) sort out\n(B) think over\n(C) turn in\n(D) take back\n\nWrite the correct word in the space next to each definition. Use each word only once.\n\nto badly damage\n\nOn the lines provided below, write P if the underlined word group is a phrase and NP if it is not a phrase. Example $\\underline{\\text{P}}$ 1. We have finally discovered the secret $\\underline{\\text{of delicious pizza. }}$'] # fmt: skip + EXPECTED_TEXT = ['Where is Paris ?\n\nAnswer this question "yes" or "no": Could a person pass out in subzero temperatures?\n\nFor the sentence below, underline the pronoun in parentheses that agrees with its antecedent.\n\nExample 1. Mary and Pam will have the opportunity to prove (herself, $\\underline{\\text{themselves}}$) at the concert.\n\nThe waiters and the manager at the restaurant will do (his, their) best to assist you.\n\nA vocabulary word appears in italics in the short passage below. Think about how the word is used. 
Then write a definition for the vocabulary word.\n\nAfter a one-hour $'] # fmt: skip model = AutoModelForCausalLM.from_pretrained(self.model_id).to(torch_device) tokenizer = AutoTokenizer.from_pretrained(self.model_id) From 4f7b434acbebd8ef552615ea6c81df4bd9b57813 Mon Sep 17 00:00:00 2001 From: Sergei Belousov Date: Fri, 12 Apr 2024 19:03:38 +0300 Subject: [PATCH 095/110] Update modeling_bark.py (#30221) Change .view() to .reshape() to prevent errors on non-contiguous tensors --- src/transformers/models/bark/modeling_bark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/bark/modeling_bark.py b/src/transformers/models/bark/modeling_bark.py index de04614075..a40ce79410 100644 --- a/src/transformers/models/bark/modeling_bark.py +++ b/src/transformers/models/bark/modeling_bark.py @@ -1068,7 +1068,7 @@ class BarkCoarseModel(BarkCausalModel): x_coarse_history[n, :] += codebook_size * n # flatten x_coarse_history - x_coarse_history = torch.transpose(x_coarse_history, 0, 1).view(-1) + x_coarse_history = torch.transpose(x_coarse_history, 0, 1).reshape(-1) x_coarse_history = x_coarse_history + semantic_generation_config.semantic_vocab_size From db7d1554444ccde72903c81160c8b45978ea018a Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Fri, 12 Apr 2024 18:59:45 +0200 Subject: [PATCH 096/110] Fix/Update for doctest (#30216) fix Co-authored-by: ydshieh --- src/transformers/models/ctrl/modeling_ctrl.py | 2 +- src/transformers/models/deit/modeling_deit.py | 2 +- src/transformers/models/deta/modeling_deta.py | 9 +++++---- .../gptsan_japanese/tokenization_gptsan_japanese.py | 6 +++--- src/transformers/models/vit_msn/modeling_vit_msn.py | 2 +- src/transformers/models/whisper/generation_whisper.py | 4 ++-- src/transformers/models/yolos/modeling_yolos.py | 10 +++++----- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/src/transformers/models/ctrl/modeling_ctrl.py b/src/transformers/models/ctrl/modeling_ctrl.py index 250ec8fc92..7534a0e50c 100644 --- a/src/transformers/models/ctrl/modeling_ctrl.py +++ b/src/transformers/models/ctrl/modeling_ctrl.py @@ -724,7 +724,7 @@ class CTRLForSequenceClassification(CTRLPreTrainedModel): >>> labels = torch.tensor(1) >>> loss = model(**inputs, labels=labels).loss >>> round(loss.item(), 2) - 0.35 + 0.93 ``` Example of multi-label classification: diff --git a/src/transformers/models/deit/modeling_deit.py b/src/transformers/models/deit/modeling_deit.py index d8f904b938..5efcc95d50 100644 --- a/src/transformers/models/deit/modeling_deit.py +++ b/src/transformers/models/deit/modeling_deit.py @@ -732,7 +732,7 @@ class DeiTForImageClassification(DeiTPreTrainedModel): >>> # model predicts one of the 1000 ImageNet classes >>> predicted_class_idx = logits.argmax(-1).item() >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) - Predicted class: magpie + Predicted class: Polaroid camera, Polaroid Land camera ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/deta/modeling_deta.py b/src/transformers/models/deta/modeling_deta.py index e849135559..ce0a5e79aa 100644 --- a/src/transformers/models/deta/modeling_deta.py +++ b/src/transformers/models/deta/modeling_deta.py @@ -1995,10 +1995,11 @@ class DetaForObjectDetection(DetaPreTrainedModel): ... f"Detected {model.config.id2label[label.item()]} with confidence " ... f"{round(score.item(), 3)} at location {box}" ... 
) - Detected cat with confidence 0.683 at location [345.85, 23.68, 639.86, 372.83] - Detected cat with confidence 0.683 at location [8.8, 52.49, 316.93, 473.45] - Detected remote with confidence 0.568 at location [40.02, 73.75, 175.96, 117.33] - Detected remote with confidence 0.546 at location [333.68, 77.13, 370.12, 187.51] + Detected cat with confidence 0.802 at location [9.87, 54.36, 316.93, 473.44] + Detected cat with confidence 0.795 at location [346.62, 24.35, 639.62, 373.2] + Detected remote with confidence 0.725 at location [40.41, 73.36, 175.77, 117.29] + Detected remote with confidence 0.638 at location [333.34, 76.81, 370.22, 187.94] + Detected couch with confidence 0.584 at location [0.03, 0.99, 640.02, 474.93] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py index 2a2b465d8c..f9b6d7fb58 100644 --- a/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py +++ b/src/transformers/models/gptsan_japanese/tokenization_gptsan_japanese.py @@ -106,15 +106,15 @@ class GPTSanJapaneseTokenizer(PreTrainedTokenizer): >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese") >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"] - [[35993, 8640, 25948, 35998, 30647, 35675, 35999, 35999], [35993, 10382, 9868, 35998, 30646, 9459, 30646, 35675]] + [[35993, 35998, 8640, 25948, 35993, 35998, 30647, 35675, 35999, 35999], [35993, 35998, 10382, 9868, 35993, 35998, 30646, 9459, 30646, 35675]] >>> # Mask for Prefix-LM inputs >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"] - [[1, 1, 1, 0, 0, 0, 0, 0], [1, 1, 1, 0, 0, 0, 0, 0]] + [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]] >>> # Mask for padding >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"] - [[1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1]] + [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]] ``` Args: diff --git a/src/transformers/models/vit_msn/modeling_vit_msn.py b/src/transformers/models/vit_msn/modeling_vit_msn.py index 45d1779b5f..9c2269a3ae 100644 --- a/src/transformers/models/vit_msn/modeling_vit_msn.py +++ b/src/transformers/models/vit_msn/modeling_vit_msn.py @@ -636,7 +636,7 @@ class ViTMSNForImageClassification(ViTMSNPreTrainedModel): >>> # model predicts one of the 1000 ImageNet classes >>> predicted_label = logits.argmax(-1).item() >>> print(model.config.id2label[predicted_label]) - Kerry blue terrier + tusker ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict diff --git a/src/transformers/models/whisper/generation_whisper.py b/src/transformers/models/whisper/generation_whisper.py index 090e608e02..b3865140f2 100644 --- a/src/transformers/models/whisper/generation_whisper.py +++ b/src/transformers/models/whisper/generation_whisper.py @@ -422,7 +422,7 @@ class WhisperGenerationMixin: >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en") >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en") - >>> model.cuda() + >>> model.cuda() # doctest: +IGNORE_RESULT >>> # load audios > 30 seconds >>> ds = load_dataset("distil-whisper/meanwhile", "default")["test"] @@ -441,7 +441,7 @@ class WhisperGenerationMixin: >>> transcription = processor.batch_decode(generated_ids, 
skip_special_tokens=True) >>> transcription[0] - ' Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile!' + " Folks, if you watch the show, you know, I spent a lot of time right over there. Patiently and astutely scrutinizing the boxwood and mahogany chest set of the day's biggest stories developing the central headline pawns, definitely maneuvering an oso topical night to F6, fainting a classic Sicilian, nade door variation on the news, all the while seeing eight moves deep and patiently marshalling the latest press releases into a fisher's shows in Lip Nitsky attack that culminates in the elegant lethal slow-played, all-passant checkmate that is my nightly monologue. But sometimes, sometimes, folks, I. CHEERING AND APPLAUSE Sometimes I startle away, cubside down in the monkey bars of a condemned playground on a super fun site. Get all hept up on goofballs. Rummage that were discarded tag bag of defective toys. Yank out a fist bowl of disembodied doll limbs, toss them on a stained kid's place mat from a defunct dennies. set up a table inside a rusty cargo container down by the Wharf and challenged toothless drifters to the godless bughouse blitz of tournament that is my segment. Meanwhile." ``` - *Shortform transcription*: If passed mel input features are < 30 seconds, the whole audio will be transcribed with a single call to generate. diff --git a/src/transformers/models/yolos/modeling_yolos.py b/src/transformers/models/yolos/modeling_yolos.py index 864be38a7d..f47b6b228f 100755 --- a/src/transformers/models/yolos/modeling_yolos.py +++ b/src/transformers/models/yolos/modeling_yolos.py @@ -769,11 +769,11 @@ class YolosForObjectDetection(YolosPreTrainedModel): ... f"Detected {model.config.id2label[label.item()]} with confidence " ... f"{round(score.item(), 3)} at location {box}" ... 
) - Detected remote with confidence 0.994 at location [46.96, 72.61, 181.02, 119.73] - Detected remote with confidence 0.975 at location [340.66, 79.19, 372.59, 192.65] - Detected cat with confidence 0.984 at location [12.27, 54.25, 319.42, 470.99] - Detected remote with confidence 0.922 at location [41.66, 71.96, 178.7, 120.33] - Detected cat with confidence 0.914 at location [342.34, 21.48, 638.64, 372.46] + Detected remote with confidence 0.991 at location [46.48, 72.78, 178.98, 119.3] + Detected remote with confidence 0.908 at location [336.48, 79.27, 368.23, 192.36] + Detected cat with confidence 0.934 at location [337.18, 18.06, 638.14, 373.09] + Detected cat with confidence 0.979 at location [10.93, 53.74, 313.41, 470.67] + Detected remote with confidence 0.974 at location [41.63, 72.23, 178.09, 119.99] ```""" return_dict = return_dict if return_dict is not None else self.config.use_return_dict From b109257f4fb8b1166e7c53cc5418632014ed53a5 Mon Sep 17 00:00:00 2001 From: ulatekh Date: Fri, 12 Apr 2024 10:03:49 -0700 Subject: [PATCH 097/110] Fixed config.json download to go to user-supplied cache directory (#30189) * Fixed config.json download to go to user-supplied cache directory. * Simplied implementation suggested by @amyeroberts --- src/transformers/pipelines/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index 8ee0137a20..e60035dd57 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -782,6 +782,7 @@ def pipeline( _raise_exceptions_for_gated_repo=False, _raise_exceptions_for_missing_entries=False, _raise_exceptions_for_connection_errors=False, + cache_dir=model_kwargs.get("cache_dir"), **hub_kwargs, ) hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash) From 8fd2de933cfeb514f4cb26f5041d4fd0ab323671 Mon Sep 17 00:00:00 2001 From: Xu Song Date: Mon, 15 Apr 2024 16:34:36 +0800 Subject: [PATCH 098/110] Add test for parse_json_file and change typing to os.PathLike (#30183) * Add test for parse_json_file * Change Path to PathLike * Fix `Import block is un-sorted or un-formatted` * revert parse_json_file * Fix ruff format * Add parse_json_file test --- src/transformers/hf_argparser.py | 9 +++++++-- tests/utils/test_hf_argparser.py | 2 +- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/transformers/hf_argparser.py b/src/transformers/hf_argparser.py index 196035a4d1..045bf79805 100644 --- a/src/transformers/hf_argparser.py +++ b/src/transformers/hf_argparser.py @@ -14,6 +14,7 @@ import dataclasses import json +import os import sys import types from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, ArgumentTypeError @@ -376,7 +377,9 @@ class HfArgumentParser(ArgumentParser): raise ValueError(f"Some keys are not used by the HfArgumentParser: {sorted(unused_keys)}") return tuple(outputs) - def parse_json_file(self, json_file: Union[str, Path], allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: + def parse_json_file( + self, json_file: Union[str, os.PathLike], allow_extra_keys: bool = False + ) -> Tuple[DataClass, ...]: """ Alternative helper method that does not use `argparse` at all, instead loading a json file and populating the dataclass types. 
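The `os.PathLike` hint widens the accepted types to anything implementing `__fspath__`, such as `pathlib.Path`, which is exactly what the new test passes. A minimal sketch with a hypothetical `TrainingConfig` dataclass:

```python
import json
import tempfile
from dataclasses import dataclass
from pathlib import Path

from transformers import HfArgumentParser


@dataclass
class TrainingConfig:  # hypothetical dataclass, for illustration only
    learning_rate: float = 5e-5
    num_epochs: int = 3


parser = HfArgumentParser(TrainingConfig)
with tempfile.TemporaryDirectory() as tmp:
    json_file = Path(tmp) / "config.json"  # a Path, not a str
    json_file.write_text(json.dumps({"learning_rate": 1e-4, "num_epochs": 5}))
    (config,) = parser.parse_json_file(json_file)

print(config.learning_rate)  # 0.0001
```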
@@ -398,7 +401,9 @@ class HfArgumentParser(ArgumentParser): outputs = self.parse_dict(data, allow_extra_keys=allow_extra_keys) return tuple(outputs) - def parse_yaml_file(self, yaml_file: Union[str, Path], allow_extra_keys: bool = False) -> Tuple[DataClass, ...]: + def parse_yaml_file( + self, yaml_file: Union[str, os.PathLike], allow_extra_keys: bool = False + ) -> Tuple[DataClass, ...]: """ Alternative helper method that does not use `argparse` at all, instead loading a yaml file and populating the dataclass types. diff --git a/tests/utils/test_hf_argparser.py b/tests/utils/test_hf_argparser.py index c0fa748cbf..91b0c199ee 100644 --- a/tests/utils/test_hf_argparser.py +++ b/tests/utils/test_hf_argparser.py @@ -379,7 +379,7 @@ class HfArgumentParserTest(unittest.TestCase): os.mkdir(temp_local_path) with open(temp_local_path + ".json", "w+") as f: json.dump(args_dict_for_json, f) - parsed_args = parser.parse_yaml_file(Path(temp_local_path + ".json"))[0] + parsed_args = parser.parse_json_file(Path(temp_local_path + ".json"))[0] args = BasicExample(**args_dict_for_json) self.assertEqual(parsed_args, args) From 06b1192768220b77d8f5a22031ed081e79df1616 Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Mon, 15 Apr 2024 14:06:06 +0530 Subject: [PATCH 099/110] fix: Replace deprecated `assertEquals` with `assertEqual` (#30241) Replace deprecated assertEquals with assertEqual. --- ...xtraction_audio_spectrogram_transformer.py | 2 +- .../test_feature_extraction_encodec.py | 20 +++++++++---------- tests/models/git/test_modeling_git.py | 6 +++--- .../test_image_processing_mask2former.py | 12 +++++------ .../test_image_processing_maskformer.py | 12 +++++------ .../rembert/test_tokenization_rembert.py | 4 ++-- .../test_feature_extraction_speech_to_text.py | 2 +- .../test_feature_extraction_speecht5.py | 4 ++-- .../tvlt/test_feature_extraction_tvlt.py | 2 +- tests/models/udop/test_modeling_udop.py | 2 +- tests/test_modeling_flax_common.py | 2 +- tests/test_tokenization_common.py | 4 ++-- 12 files changed, 36 insertions(+), 36 deletions(-) diff --git a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py index ac6cd5eb1f..fbe2509086 100644 --- a/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py +++ b/tests/models/audio_spectrogram_transformer/test_feature_extraction_audio_spectrogram_transformer.py @@ -173,7 +173,7 @@ class ASTFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Test input_speech = self._load_datasamples(1) feature_extractor = ASTFeatureExtractor() input_values = feature_extractor(input_speech, return_tensors="pt").input_values - self.assertEquals(input_values.shape, (1, 1024, 128)) + self.assertEqual(input_values.shape, (1, 1024, 128)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) def test_feat_extract_from_and_save_pretrained(self): diff --git a/tests/models/encodec/test_feature_extraction_encodec.py b/tests/models/encodec/test_feature_extraction_encodec.py index 5a8010d247..e56517ac41 100644 --- a/tests/models/encodec/test_feature_extraction_encodec.py +++ b/tests/models/encodec/test_feature_extraction_encodec.py @@ -158,7 +158,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. 
input_audio = self._load_datasamples(1) feature_extractor = EncodecFeatureExtractor() input_values = feature_extractor(input_audio, return_tensors="pt").input_values - self.assertEquals(input_values.shape, (1, 1, 93680)) + self.assertEqual(input_values.shape, (1, 1, 93680)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-6)) def test_integration_stereo(self): @@ -177,7 +177,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. input_audio[0][1] *= 0.5 feature_extractor = EncodecFeatureExtractor(feature_size=2) input_values = feature_extractor(input_audio, return_tensors="pt").input_values - self.assertEquals(input_values.shape, (1, 2, 93680)) + self.assertEqual(input_values.shape, (1, 2, 93680)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-6)) self.assertTrue(torch.allclose(input_values[0, 1, :30], EXPECTED_INPUT_VALUES * 0.5, atol=1e-6)) @@ -197,27 +197,27 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. # truncate to chunk truncated_outputs = feature_extractor(input_audio, truncation=True, return_tensors="pt").input_values - self.assertEquals(truncated_outputs.shape, (2, 1, 71520)) # 2 chunks + self.assertEqual(truncated_outputs.shape, (2, 1, 71520)) # 2 chunks # force truncate to max_length truncated_outputs = feature_extractor( input_audio, truncation=True, max_length=48000, return_tensors="pt" ).input_values - self.assertEquals(truncated_outputs.shape, (2, 1, 48000)) + self.assertEqual(truncated_outputs.shape, (2, 1, 48000)) # pad to chunk padded_outputs = feature_extractor(input_audio, padding=True, return_tensors="pt").input_values - self.assertEquals(padded_outputs.shape, (2, 1, 95280)) + self.assertEqual(padded_outputs.shape, (2, 1, 95280)) # pad to chunk truncated_outputs = feature_extractor(input_audio, return_tensors="pt").input_values - self.assertEquals(truncated_outputs.shape, (2, 1, 95280)) + self.assertEqual(truncated_outputs.shape, (2, 1, 95280)) # force pad to max length truncated_outputs = feature_extractor( input_audio, padding="max_length", max_length=100000, return_tensors="pt" ).input_values - self.assertEquals(truncated_outputs.shape, (2, 1, 100000)) + self.assertEqual(truncated_outputs.shape, (2, 1, 100000)) # force no pad with self.assertRaisesRegex( @@ -227,7 +227,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. truncated_outputs = feature_extractor(input_audio, padding=False, return_tensors="pt").input_values truncated_outputs = feature_extractor(input_audio[0], padding=False, return_tensors="pt").input_values - self.assertEquals(truncated_outputs.shape, (1, 1, 93680)) + self.assertEqual(truncated_outputs.shape, (1, 1, 93680)) # no pad if no chunk_length_s feature_extractor.chunk_length_s = None @@ -238,7 +238,7 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. truncated_outputs = feature_extractor(input_audio, padding=False, return_tensors="pt").input_values truncated_outputs = feature_extractor(input_audio[0], padding=False, return_tensors="pt").input_values - self.assertEquals(truncated_outputs.shape, (1, 1, 93680)) + self.assertEqual(truncated_outputs.shape, (1, 1, 93680)) # no pad if no overlap feature_extractor.chunk_length_s = 2 @@ -250,4 +250,4 @@ class EnCodecFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest. 
truncated_outputs = feature_extractor(input_audio, padding=False, return_tensors="pt").input_values truncated_outputs = feature_extractor(input_audio[0], padding=False, return_tensors="pt").input_values - self.assertEquals(truncated_outputs.shape, (1, 1, 93680)) + self.assertEqual(truncated_outputs.shape, (1, 1, 93680)) diff --git a/tests/models/git/test_modeling_git.py b/tests/models/git/test_modeling_git.py index 6a891f17b0..0ef74d8184 100644 --- a/tests/models/git/test_modeling_git.py +++ b/tests/models/git/test_modeling_git.py @@ -510,7 +510,7 @@ class GitModelIntegrationTest(unittest.TestCase): expected_shape = torch.Size((1, 9)) self.assertEqual(outputs.sequences.shape, expected_shape) - self.assertEquals(generated_caption, "two cats laying on a pink blanket") + self.assertEqual(generated_caption, "two cats laying on a pink blanket") self.assertTrue(outputs.scores[-1].shape, expected_shape) expected_slice = torch.tensor([[-0.8805, -0.8803, -0.8799]], device=torch_device) self.assertTrue(torch.allclose(outputs.scores[-1][0, :3], expected_slice, atol=1e-4)) @@ -537,7 +537,7 @@ class GitModelIntegrationTest(unittest.TestCase): expected_shape = torch.Size((1, 15)) self.assertEqual(generated_ids.shape, expected_shape) - self.assertEquals(generated_caption, "what does the front of the bus say at the top? special") + self.assertEqual(generated_caption, "what does the front of the bus say at the top? special") def test_batched_generation(self): processor = GitProcessor.from_pretrained("microsoft/git-base-coco") @@ -555,4 +555,4 @@ class GitModelIntegrationTest(unittest.TestCase): generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50) generated_captions = processor.batch_decode(generated_ids, skip_special_tokens=True) - self.assertEquals(generated_captions, ["two cats sleeping on a pink blanket next to remotes."] * 2) + self.assertEqual(generated_captions, ["two cats sleeping on a pink blanket next to remotes."] * 2) diff --git a/tests/models/mask2former/test_image_processing_mask2former.py b/tests/models/mask2former/test_image_processing_mask2former.py index 24d5b8cf89..9b9e46907b 100644 --- a/tests/models/mask2former/test_image_processing_mask2former.py +++ b/tests/models/mask2former/test_image_processing_mask2former.py @@ -297,8 +297,8 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase self.assertEqual(len(inputs["mask_labels"]), 2) self.assertEqual(inputs["mask_labels"][0].shape, (2, 512, 512)) self.assertEqual(inputs["mask_labels"][1].shape, (4, 512, 512)) - self.assertEquals(inputs["mask_labels"][0].sum().item(), 41527.0) - self.assertEquals(inputs["mask_labels"][1].sum().item(), 26259.0) + self.assertEqual(inputs["mask_labels"][0].sum().item(), 41527.0) + self.assertEqual(inputs["mask_labels"][1].sum().item(), 26259.0) def test_integration_semantic_segmentation(self): # load 2 images and corresponding semantic annotations from the hub @@ -339,8 +339,8 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase self.assertEqual(len(inputs["mask_labels"]), 2) self.assertEqual(inputs["mask_labels"][0].shape, (3, 512, 512)) self.assertEqual(inputs["mask_labels"][1].shape, (8, 512, 512)) - self.assertEquals(inputs["mask_labels"][0].sum().item(), 170200.0) - self.assertEquals(inputs["mask_labels"][1].sum().item(), 257036.0) + self.assertEqual(inputs["mask_labels"][0].sum().item(), 170200.0) + self.assertEqual(inputs["mask_labels"][1].sum().item(), 257036.0) def 
test_integration_panoptic_segmentation(self): # load 2 images and corresponding panoptic annotations from the hub @@ -400,8 +400,8 @@ class Mask2FormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase self.assertEqual(len(inputs["mask_labels"]), 2) self.assertEqual(inputs["mask_labels"][0].shape, (79, 512, 711)) self.assertEqual(inputs["mask_labels"][1].shape, (61, 512, 711)) - self.assertEquals(inputs["mask_labels"][0].sum().item(), 315193.0) - self.assertEquals(inputs["mask_labels"][1].sum().item(), 350747.0) + self.assertEqual(inputs["mask_labels"][0].sum().item(), 315193.0) + self.assertEqual(inputs["mask_labels"][1].sum().item(), 350747.0) def test_binary_mask_to_rle(self): fake_binary_mask = np.zeros((20, 50)) diff --git a/tests/models/maskformer/test_image_processing_maskformer.py b/tests/models/maskformer/test_image_processing_maskformer.py index e4779f896a..dcb0a04f57 100644 --- a/tests/models/maskformer/test_image_processing_maskformer.py +++ b/tests/models/maskformer/test_image_processing_maskformer.py @@ -297,8 +297,8 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase) self.assertEqual(len(inputs["mask_labels"]), 2) self.assertEqual(inputs["mask_labels"][0].shape, (2, 512, 512)) self.assertEqual(inputs["mask_labels"][1].shape, (4, 512, 512)) - self.assertEquals(inputs["mask_labels"][0].sum().item(), 41527.0) - self.assertEquals(inputs["mask_labels"][1].sum().item(), 26259.0) + self.assertEqual(inputs["mask_labels"][0].sum().item(), 41527.0) + self.assertEqual(inputs["mask_labels"][1].sum().item(), 26259.0) def test_integration_semantic_segmentation(self): # load 2 images and corresponding semantic annotations from the hub @@ -339,8 +339,8 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase) self.assertEqual(len(inputs["mask_labels"]), 2) self.assertEqual(inputs["mask_labels"][0].shape, (3, 512, 512)) self.assertEqual(inputs["mask_labels"][1].shape, (8, 512, 512)) - self.assertEquals(inputs["mask_labels"][0].sum().item(), 170200.0) - self.assertEquals(inputs["mask_labels"][1].sum().item(), 257036.0) + self.assertEqual(inputs["mask_labels"][0].sum().item(), 170200.0) + self.assertEqual(inputs["mask_labels"][1].sum().item(), 257036.0) def test_integration_panoptic_segmentation(self): # load 2 images and corresponding panoptic annotations from the hub @@ -400,8 +400,8 @@ class MaskFormerImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase) self.assertEqual(len(inputs["mask_labels"]), 2) self.assertEqual(inputs["mask_labels"][0].shape, (79, 512, 711)) self.assertEqual(inputs["mask_labels"][1].shape, (61, 512, 711)) - self.assertEquals(inputs["mask_labels"][0].sum().item(), 315193.0) - self.assertEquals(inputs["mask_labels"][1].sum().item(), 350747.0) + self.assertEqual(inputs["mask_labels"][0].sum().item(), 315193.0) + self.assertEqual(inputs["mask_labels"][1].sum().item(), 350747.0) def test_binary_mask_to_rle(self): fake_binary_mask = np.zeros((20, 50)) diff --git a/tests/models/rembert/test_tokenization_rembert.py b/tests/models/rembert/test_tokenization_rembert.py index 5f65629213..096106a2fc 100644 --- a/tests/models/rembert/test_tokenization_rembert.py +++ b/tests/models/rembert/test_tokenization_rembert.py @@ -88,13 +88,13 @@ class RemBertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): encoded_string = tokenizer.encode(text) self.assertListEqual(encoded_string, [1000, 7, 0, 1001]) decode_text = tokenizer.convert_tokens_to_string(tokens) - self.assertEquals(decode_text, text) + 
self.assertEqual(decode_text, text) text = "That's awesome! 🤩 #HuggingFace, 🌟 Have a great day! 🌈" tokens = tokenizer.tokenize(text) self.assertListEqual( tokens, ['▁That', "'", 's', '▁a', 'w', 'es', 'ome', '!', '▁', '🤩', '▁', '#', 'H', 'u', 'g', 'g', 'ing', 'F', 'a', 'ce', ',', '▁', '🌟', '▁H', 'a', 've', '▁a', '▁great', '▁day', '!', '▁', '🌈']) # fmt: skip decode_text = tokenizer.convert_tokens_to_string(tokens) - self.assertEquals(decode_text, "That's awesome! 🤩 #HuggingFace, 🌟 Have a great day! 🌈") + self.assertEqual(decode_text, "That's awesome! 🤩 #HuggingFace, 🌟 Have a great day! 🌈") text = "In the sky up above" tokens = tokenizer._tokenize(text) diff --git a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py index f652d09ffc..9023e8467f 100644 --- a/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py +++ b/tests/models/speech_to_text/test_feature_extraction_speech_to_text.py @@ -277,7 +277,7 @@ class Speech2TextFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unitt input_speech = self._load_datasamples(1) feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict()) input_features = feature_extractor(input_speech, return_tensors="pt").input_features - self.assertEquals(input_features.shape, (1, 584, 24)) + self.assertEqual(input_features.shape, (1, 584, 24)) self.assertTrue(np.allclose(input_features[0, 0, :30], expected, atol=1e-4)) def test_feat_extract_from_and_save_pretrained(self): diff --git a/tests/models/speecht5/test_feature_extraction_speecht5.py b/tests/models/speecht5/test_feature_extraction_speecht5.py index 22d99a8180..5ec632e7e7 100644 --- a/tests/models/speecht5/test_feature_extraction_speecht5.py +++ b/tests/models/speecht5/test_feature_extraction_speecht5.py @@ -401,7 +401,7 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest input_speech = self._load_datasamples(1) feature_extractor = SpeechT5FeatureExtractor() input_values = feature_extractor(input_speech, return_tensors="pt").input_values - self.assertEquals(input_values.shape, (1, 93680)) + self.assertEqual(input_values.shape, (1, 93680)) self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-6)) def test_integration_target(self): @@ -417,5 +417,5 @@ class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest input_speech = self._load_datasamples(1) feature_extractor = SpeechT5FeatureExtractor() input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values - self.assertEquals(input_values.shape, (1, 366, 80)) + self.assertEqual(input_values.shape, (1, 366, 80)) self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4)) diff --git a/tests/models/tvlt/test_feature_extraction_tvlt.py b/tests/models/tvlt/test_feature_extraction_tvlt.py index e2d8c624b0..cd737d5a8f 100644 --- a/tests/models/tvlt/test_feature_extraction_tvlt.py +++ b/tests/models/tvlt/test_feature_extraction_tvlt.py @@ -176,7 +176,7 @@ class TvltFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.Tes feature_extractor = TvltFeatureExtractor() audio_values = feature_extractor(input_speech, return_tensors="pt").audio_values - self.assertEquals(audio_values.shape, (1, 1, 192, 128)) + self.assertEqual(audio_values.shape, (1, 1, 192, 128)) expected_slice = torch.tensor([[-0.3032, -0.2708], [-0.4434, -0.4007]]) 
self.assertTrue(torch.allclose(audio_values[0, 0, :2, :2], expected_slice, atol=1e-4)) diff --git a/tests/models/udop/test_modeling_udop.py b/tests/models/udop/test_modeling_udop.py index 257f6245ee..63e7a2fa78 100644 --- a/tests/models/udop/test_modeling_udop.py +++ b/tests/models/udop/test_modeling_udop.py @@ -574,4 +574,4 @@ class UdopModelIntegrationTests(unittest.TestCase): predicted_ids = model.generate(**encoding) predicted_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0] - self.assertEquals(predicted_text, "2013") + self.assertEqual(predicted_text, "2013") diff --git a/tests/test_modeling_flax_common.py b/tests/test_modeling_flax_common.py index ef99786fdf..22d6b241f0 100644 --- a/tests/test_modeling_flax_common.py +++ b/tests/test_modeling_flax_common.py @@ -792,7 +792,7 @@ class FlaxModelTesterMixin: types = flatten_dict(types) for name, type_ in types.items(): - self.assertEquals(type_, jnp.float32, msg=f"param {name} is not initialized in fp32.") + self.assertEqual(type_, jnp.float32, msg=f"param {name} is not initialized in fp32.") def test_to_bf16(self): config, _ = self.model_tester.prepare_config_and_inputs_for_common() diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index e98f09d431..76402cd092 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1608,7 +1608,7 @@ class TokenizerTesterMixin: with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): slow_encode = slow_tokenizer.encode(string_to_check) fast_encode = rust_tokenizer.encode(string_to_check) - self.assertEquals( + self.assertEqual( slow_encode, fast_encode, "Hint: the following tokenization diff were obtained for slow vs fast:\n " @@ -1620,7 +1620,7 @@ class TokenizerTesterMixin: for chunk in range(0, len(input_full_vocab_ids) - 100, 100): ids_to_decode = input_full_vocab_ids[chunk : chunk + 100] with self.subTest(f"{(chunk/len(input_full_vocab_string))*100}%"): - self.assertEquals( + self.assertEqual( slow_tokenizer.decode( ids_to_decode, space_between_special_tokens=False, From f0107862189b461755bb0bd86a28424060cb2fd2 Mon Sep 17 00:00:00 2001 From: JINO ROHIT Date: Mon, 15 Apr 2024 16:09:10 +0530 Subject: [PATCH 100/110] Set pad_token in run_glue_no_trainer.py #28534 (#30234) --- examples/pytorch/text-classification/run_glue_no_trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/pytorch/text-classification/run_glue_no_trainer.py b/examples/pytorch/text-classification/run_glue_no_trainer.py index 24c983145d..d679a6a63c 100644 --- a/examples/pytorch/text-classification/run_glue_no_trainer.py +++ b/examples/pytorch/text-classification/run_glue_no_trainer.py @@ -327,6 +327,9 @@ def main(): tokenizer = AutoTokenizer.from_pretrained( args.model_name_or_path, use_fast=not args.use_slow_tokenizer, trust_remote_code=args.trust_remote_code ) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + config.pad_token_id = tokenizer.pad_token_id model = AutoModelForSequenceClassification.from_pretrained( args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path), From b3595cf02b9997e9acfc23ed6f4bcec78cd315ed Mon Sep 17 00:00:00 2001 From: Sai-Suraj-27 Date: Mon, 15 Apr 2024 16:48:37 +0530 Subject: [PATCH 101/110] fix: Replaced deprecated `typing.Text` with `str` (#30230) typing.Text is deprecated. 
Use str instead --- examples/research_projects/tapex/wikisql_utils.py | 4 ++-- .../models/tapas/tokenization_tapas.py | 14 +++++++------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/research_projects/tapex/wikisql_utils.py b/examples/research_projects/tapex/wikisql_utils.py index 110b14e02f..3351bddf01 100644 --- a/examples/research_projects/tapex/wikisql_utils.py +++ b/examples/research_projects/tapex/wikisql_utils.py @@ -21,7 +21,7 @@ import re # The following script is adapted from the script of TaPas. # Original: https://github.com/google-research/tapas/master/wikisql_utils.py -from typing import Any, List, Text +from typing import Any, List EMPTY_ANSWER = "none" @@ -114,7 +114,7 @@ class _Operator(enum.Enum): class _Condition: """Represents an SQL where clauses (e.g A = "a" or B > 5).""" - column: Text + column: str operator: _Operator cmp_value: Any diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 124d48df24..23fbd5300e 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -24,7 +24,7 @@ import os import re import unicodedata from dataclasses import dataclass -from typing import Callable, Dict, Generator, List, Optional, Text, Tuple, Union +from typing import Callable, Dict, Generator, List, Optional, Tuple, Union import numpy as np @@ -70,19 +70,19 @@ class TokenCoordinates: @dataclass class TokenizedTable: - rows: List[List[List[Text]]] + rows: List[List[List[str]]] selected_tokens: List[TokenCoordinates] @dataclass(frozen=True) class SerializedExample: - tokens: List[Text] + tokens: List[str] column_ids: List[int] row_ids: List[int] segment_ids: List[int] -def _is_inner_wordpiece(token: Text): +def _is_inner_wordpiece(token: str): return token.startswith("##") @@ -2224,14 +2224,14 @@ class NumericValueSpan: @dataclass class Cell: - text: Text + text: str numeric_value: Optional[NumericValue] = None @dataclass class Question: - original_text: Text # The original raw question string. - text: Text # The question string after normalization. + original_text: str # The original raw question string. + text: str # The question string after normalization. 
numeric_spans: Optional[List[NumericValueSpan]] = None From b6b6daf2b7d0726084bba60e21196989dd12f7f6 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 15 Apr 2024 13:20:36 +0200 Subject: [PATCH 102/110] Refactor doctest (#30210) * fix * update * fix * update * fix --------- Co-authored-by: ydshieh --- .github/workflows/doctest_job.yml | 81 +++++++++++++ .github/workflows/doctests.yml | 88 +++++++------- utils/notification_service_doc_tests.py | 152 +++++++++++------------- utils/split_doctest_jobs.py | 91 ++++++++++++++ utils/tests_fetcher.py | 3 + 5 files changed, 289 insertions(+), 126 deletions(-) create mode 100644 .github/workflows/doctest_job.yml create mode 100644 utils/split_doctest_jobs.py diff --git a/.github/workflows/doctest_job.yml b/.github/workflows/doctest_job.yml new file mode 100644 index 0000000000..3e923c1c8f --- /dev/null +++ b/.github/workflows/doctest_job.yml @@ -0,0 +1,81 @@ +name: Doctest job + +on: + workflow_call: + inputs: + job_splits: + required: true + type: string + split_keys: + required: true + type: string + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + RUN_SLOW: yes + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + +jobs: + run_doctests: + name: " " + strategy: + fail-fast: false + matrix: + split_keys: ${{ fromJson(inputs.split_keys) }} + runs-on: [single-gpu, nvidia-gpu, t4, ci] + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[flax] + + - name: GPU visibility + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + run: pip freeze + + - name: Get doctest files + working-directory: /transformers + run: | + echo "${{ toJson(fromJson(inputs.job_splits)[matrix.split_keys]) }}" > doc_tests.txt + cat doc_tests.txt + + - name: Set `split_keys` + shell: bash + run: | + echo "${{ matrix.split_keys }}" + split_keys=${{ matrix.split_keys }} + split_keys=${split_keys//'/'/'_'} + echo "split_keys" + echo "split_keys=$split_keys" >> $GITHUB_ENV + + - name: Run doctests + working-directory: /transformers + run: | + cat doc_tests.txt + python3 -m pytest -v --make-reports doc_tests_gpu_${{ env.split_keys }} --doctest-modules $(cat doc_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.md" + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/doc_tests_gpu_${{ env.split_keys }}/failures_short.txt + + - name: "Test suite reports artifacts: doc_tests_gpu_test_reports_${{ env.split_keys }}" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: doc_tests_gpu_test_reports_${{ env.split_keys }} + path: /transformers/reports/doc_tests_gpu_${{ env.split_keys }} diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 0384144cea..014740685e 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -3,81 +3,85 @@ name: Doctests on: push: branches: - - doctest* + - run_doctest* repository_dispatch: 
schedule: - cron: "17 2 * * *" - env: - HF_HOME: /mnt/cache - TRANSFORMERS_IS_CI: yes - RUN_SLOW: yes - OMP_NUM_THREADS: 16 - MKL_NUM_THREADS: 16 - SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} - TF_FORCE_GPU_ALLOW_GROWTH: true + NUM_SLICES: 3 jobs: - run_doctests: + setup: + name: Setup runs-on: [single-gpu, nvidia-gpu, t4, ci] container: image: huggingface/transformers-all-latest-gpu options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + job_splits: ${{ steps.set-matrix.outputs.job_splits }} + split_keys: ${{ steps.set-matrix.outputs.split_keys }} steps: - - name: uninstall transformers (installed during docker image build) - run: python3 -m pip uninstall -y transformers - - - uses: actions/checkout@v3 - - name: NVIDIA-SMI + - name: Update clone + working-directory: /transformers run: | - nvidia-smi + git fetch && git checkout ${{ github.sha }} - - name: Install transformers in edit mode - run: python3 -m pip install -e .[flax] - - - name: GPU visibility - run: | - python3 utils/print_env.py + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . - name: Show installed libraries and their versions + working-directory: /transformers run: pip freeze - - name: Get doctest files + - name: Check values for matrix + working-directory: /transformers run: | - $(python3 -c 'from utils.tests_fetcher import get_all_doctest_files; to_test = get_all_doctest_files(); to_test = " ".join(to_test); fp = open("doc_tests.txt", "w"); fp.write(to_test); fp.close()') + python3 utils/split_doctest_jobs.py + python3 utils/split_doctest_jobs.py --only_return_keys --num_splits ${{ env.NUM_SLICES }} - - name: Run doctests + - id: set-matrix + working-directory: /transformers + name: Set values for matrix run: | - python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat doc_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.md" - - - name: Failure short reports - if: ${{ failure() }} - continue-on-error: true - run: cat reports/doc_tests_gpu/failures_short.txt - - - name: Test suite reports artifacts - if: ${{ always() }} - uses: actions/upload-artifact@v3 - with: - name: doc_tests_gpu_test_reports - path: reports/doc_tests_gpu + echo "job_splits=$(python3 utils/split_doctest_jobs.py)" >> $GITHUB_OUTPUT + echo "split_keys=$(python3 utils/split_doctest_jobs.py --only_return_keys --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT + call_doctest_job: + name: "Call doctest jobs" + needs: setup + strategy: + fail-fast: false + matrix: + split_keys: ${{ fromJson(needs.setup.outputs.split_keys) }} + uses: ./.github/workflows/doctest_job.yml + with: + job_splits: ${{ needs.setup.outputs.job_splits }} + split_keys: ${{ toJson(matrix.split_keys) }} + secrets: inherit send_results: name: Send results to webhook runs-on: ubuntu-22.04 if: always() - needs: [run_doctests] + needs: [call_doctest_job] steps: - uses: actions/checkout@v3 - uses: actions/download-artifact@v3 - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} - CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_DOCS }} - CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_DOCS }} - CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + # Use `CI_SLACK_CHANNEL_DUMMY_TESTS` when doing 
experimentation + SLACK_REPORT_CHANNEL: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY_DOCS }} run: | pip install slack_sdk python utils/notification_service_doc_tests.py + + - name: "Upload results" + if: ${{ always() }} + uses: actions/upload-artifact@v3 + with: + name: doc_test_results + path: doc_test_results \ No newline at end of file diff --git a/utils/notification_service_doc_tests.py b/utils/notification_service_doc_tests.py index c516963be1..d944f73b63 100644 --- a/utils/notification_service_doc_tests.py +++ b/utils/notification_service_doc_tests.py @@ -12,16 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import collections import json -import math import os import re import time -from fnmatch import fnmatch from typing import Dict, List -import requests +from get_ci_error_statistics import get_jobs from slack_sdk import WebClient @@ -66,9 +63,8 @@ class Message: def __init__(self, title: str, doc_test_results: Dict): self.title = title - self._time_spent = doc_test_results["time_spent"].split(",")[0] - self.n_success = doc_test_results["success"] - self.n_failures = doc_test_results["failures"] + self.n_success = sum(job_result["n_success"] for job_result in doc_test_results.values()) + self.n_failures = sum(job_result["n_failures"] for job_result in doc_test_results.values()) self.n_tests = self.n_success + self.n_failures # Failures and success of the modeling tests @@ -76,7 +72,8 @@ class Message: @property def time(self) -> str: - time_spent = [self._time_spent] + all_results = [*self.doc_test_results.values()] + time_spent = [r["time_spent"].split(", ")[0] for r in all_results if len(r["time_spent"])] total_secs = 0 for time in time_spent: @@ -205,7 +202,7 @@ class Message: print(json.dumps({"blocks": json.loads(payload)})) client.chat_postMessage( - channel=os.environ["CI_SLACK_CHANNEL_ID_DAILY"], + channel=SLACK_REPORT_CHANNEL_ID, text="There was an issue running the tests.", blocks=payload, ) @@ -217,7 +214,7 @@ class Message: text = f"{self.n_failures} failures out of {self.n_tests} tests," if self.n_failures else "All tests passed." 
self.thread_ts = client.chat_postMessage( - channel=os.environ["CI_SLACK_CHANNEL_ID_DAILY"], + channel=SLACK_REPORT_CHANNEL_ID, blocks=self.payload, text=text, ) @@ -248,7 +245,7 @@ class Message: } return [ - {"type": "header", "text": {"type": "plain_text", "text": title.upper(), "emoji": True}}, + {"type": "header", "text": {"type": "plain_text", "text": title, "emoji": True}}, content, {"type": "section", "text": {"type": "mrkdwn", "text": failure_text}}, ] @@ -257,24 +254,19 @@ class Message: if self.thread_ts is None: raise ValueError("Can only post reply if a post has been made.") - job_link = self.doc_test_results.pop("job_link") - self.doc_test_results.pop("failures") - self.doc_test_results.pop("success") - self.doc_test_results.pop("time_spent") - sorted_dict = sorted(self.doc_test_results.items(), key=lambda t: t[0]) - for job, job_result in sorted_dict: - if len(job_result["failures"]): + for job_name, job_result in sorted_dict: + if len(job_result["failures"]) > 0: text = f"*Num failures* :{len(job_result['failed'])} \n" failures = job_result["failures"] - blocks = self.get_reply_blocks(job, job_link, failures, text=text) + blocks = self.get_reply_blocks(job_name, job_result["job_link"], failures, text=text) print("Sending the following reply") print(json.dumps({"blocks": blocks})) client.chat_postMessage( - channel=os.environ["CI_SLACK_CHANNEL_ID_DAILY"], - text=f"Results for {job}", + channel=SLACK_REPORT_CHANNEL_ID, + text=f"Results for {job_name}", blocks=blocks, thread_ts=self.thread_ts["ts"], ) @@ -282,27 +274,6 @@ class Message: time.sleep(1) -def get_job_links(): - run_id = os.environ["GITHUB_RUN_ID"] - url = f"https://api.github.com/repos/huggingface/transformers/actions/runs/{run_id}/jobs?per_page=100" - result = requests.get(url).json() - jobs = {} - - try: - jobs.update({job["name"]: job["html_url"] for job in result["jobs"]}) - pages_to_iterate_over = math.ceil((result["total_count"] - 100) / 100) - - for i in range(pages_to_iterate_over): - result = requests.get(url + f"&page={i + 2}").json() - jobs.update({job["name"]: job["html_url"] for job in result["jobs"]}) - - return jobs - except Exception as e: - print("Unknown error, could not fetch links.", e) - - return {} - - def retrieve_artifact(name: str): _artifact = {} @@ -344,57 +315,70 @@ def retrieve_available_artifacts(): if __name__ == "__main__": - github_actions_job_links = get_job_links() - available_artifacts = retrieve_available_artifacts() + SLACK_REPORT_CHANNEL_ID = os.environ["SLACK_REPORT_CHANNEL"] - docs = collections.OrderedDict( - [ - ("*.py", "API Examples"), - ("*.md", "MD Examples"), - ] + github_actions_jobs = get_jobs( + workflow_run_id=os.environ["GITHUB_RUN_ID"], token=os.environ["ACCESS_REPO_INFO_TOKEN"] ) - # This dict will contain all the information relative to each doc test category: - # - failed: list of failed tests - # - failures: dict in the format 'test': 'error_message' - doc_test_results = { - v: { - "failed": [], - "failures": {}, - } - for v in docs.values() - } + artifact_name_to_job_map = {} + for job in github_actions_jobs: + for step in job["steps"]: + if step["name"].startswith("Test suite reports artifacts: "): + artifact_name = step["name"][len("Test suite reports artifacts: ") :] + artifact_name_to_job_map[artifact_name] = job + break - # Link to the GitHub Action job - doc_test_results["job_link"] = github_actions_job_links.get("run_doctests") + available_artifacts = retrieve_available_artifacts() - artifact_path = 
available_artifacts["doc_tests_gpu_test_reports"].paths[0] - artifact = retrieve_artifact(artifact_path["name"]) - if "stats" in artifact: - failed, success, time_spent = handle_test_results(artifact["stats"]) - doc_test_results["failures"] = failed - doc_test_results["success"] = success - doc_test_results["time_spent"] = time_spent[1:-1] + ", " + doc_test_results = {} + # `artifact_key` is the artifact path + for artifact_key, artifact_obj in available_artifacts.items(): + artifact_path = artifact_obj.paths[0] + if not artifact_path["path"].startswith("doc_tests_gpu_test_reports_"): + continue - all_failures = extract_first_line_failure(artifact["failures_short"]) - for line in artifact["summary_short"].split("\n"): - if re.search("FAILED", line): - line = line.replace("FAILED ", "") - line = line.split()[0].replace("\n", "") + # change "_" back to "/" (to show the job name as path) + job_name = artifact_path["path"].replace("doc_tests_gpu_test_reports_", "").replace("_", "/") - if "::" in line: - file_path, test = line.split("::") - else: - file_path, test = line, line + # This dict (for each job) will contain all the information relative to each doc test job, in particular: + # - failed: list of failed tests + # - failures: dict in the format 'test': 'error_message' + job_result = {} + doc_test_results[job_name] = job_result - for file_regex in docs.keys(): - if fnmatch(file_path, file_regex): - category = docs[file_regex] - doc_test_results[category]["failed"].append(test) + job = artifact_name_to_job_map[artifact_path["path"]] + job_result["job_link"] = job["html_url"] + job_result["category"] = "Python Examples" if job_name.startswith("src/") else "MD Examples" - failure = all_failures[test] if test in all_failures else "N/A" - doc_test_results[category]["failures"][test] = failure - break + artifact = retrieve_artifact(artifact_path["path"]) + if "stats" in artifact: + failed, success, time_spent = handle_test_results(artifact["stats"]) + job_result["n_failures"] = failed + job_result["n_success"] = success + job_result["time_spent"] = time_spent[1:-1] + ", " + job_result["failed"] = [] + job_result["failures"] = {} + + all_failures = extract_first_line_failure(artifact["failures_short"]) + for line in artifact["summary_short"].split("\n"): + if re.search("FAILED", line): + line = line.replace("FAILED ", "") + line = line.split()[0].replace("\n", "") + + if "::" in line: + file_path, test = line.split("::") + else: + file_path, test = line, line + + job_result["failed"].append(test) + failure = all_failures[test] if test in all_failures else "N/A" + job_result["failures"][test] = failure + + # Save and to be uploaded as artifact + os.makedirs("doc_test_results", exist_ok=True) + with open("doc_test_results/doc_test_results.json", "w", encoding="UTF-8") as fp: + json.dump(doc_test_results, fp, ensure_ascii=False, indent=4) message = Message("🤗 Results of the doc tests.", doc_test_results) message.post() diff --git a/utils/split_doctest_jobs.py b/utils/split_doctest_jobs.py new file mode 100644 index 0000000000..0735298f31 --- /dev/null +++ b/utils/split_doctest_jobs.py @@ -0,0 +1,91 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+This script is used to get the files against which we will run doc testing.
+This uses `tests_fetcher.get_all_doctest_files` then groups the test files by their directory paths.
+
+The files in `docs/source/en/model_doc` or `docs/source/en/tasks` are **NOT** grouped together with other files in the
+same directory: the objective is to run doctest against them in independent GitHub Actions jobs.
+
+Assume we are under `transformers` root directory:
+To get a map (dictionary) between directory (or file) paths and the corresponding files
+```bash
+python utils/split_doctest_jobs.py
+```
+or to get a list of lists of directory (or file) paths
+```bash
+python utils/split_doctest_jobs.py --only_return_keys --num_splits 4
+```
+(this is used to allow GitHub Actions to generate more than 256 jobs using matrix)
+"""

+import argparse
+from collections import defaultdict
+from pathlib import Path
+
+from tests_fetcher import get_all_doctest_files
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--only_return_keys",
+        action="store_true",
+        help="whether to only return the keys (which is a list of lists of directory or file paths).",
+    )
+    parser.add_argument(
+        "--num_splits",
+        type=int,
+        default=1,
+        help="the number of splits into which the (flat) list of directory/file paths will be split. This has effect only if `only_return_keys` is `True`.",
+    )
+    args = parser.parse_args()
+
+    all_doctest_files = get_all_doctest_files()
+
+    raw_test_collection_map = defaultdict(list)
+
+    for file in all_doctest_files:
+        file_dir = "/".join(Path(file).parents[0].parts)
+        raw_test_collection_map[file_dir].append(file)
+
+    refined_test_collection_map = {}
+    for file_dir in raw_test_collection_map.keys():
+        if file_dir in ["docs/source/en/model_doc", "docs/source/en/tasks"]:
+            for file in raw_test_collection_map[file_dir]:
+                refined_test_collection_map[file] = file
+        else:
+            refined_test_collection_map[file_dir] = " ".join(sorted(raw_test_collection_map[file_dir]))
+
+    sorted_file_dirs = sorted(refined_test_collection_map.keys())
+
+    test_collection_map = {}
+    for file_dir in sorted_file_dirs:
+        test_collection_map[file_dir] = refined_test_collection_map[file_dir]
+
+    num_jobs = len(test_collection_map)
+    num_jobs_per_splits = num_jobs // args.num_splits
+
+    file_directory_splits = []
+    end = 0
+    for idx in range(args.num_splits):
+        start = end
+        end = start + num_jobs_per_splits + (1 if idx < num_jobs % args.num_splits else 0)
+        file_directory_splits.append(sorted_file_dirs[start:end])
+
+    if args.only_return_keys:
+        print(file_directory_splits)
+    else:
+        print(dict(test_collection_map))
diff --git a/utils/tests_fetcher.py b/utils/tests_fetcher.py
index e54e6d0de4..a7a24d6659 100644
--- a/utils/tests_fetcher.py
+++ b/utils/tests_fetcher.py
@@ -502,7 +502,10 @@ def get_all_doctest_files() -> List[str]:
     """
     py_files = [str(x.relative_to(PATH_TO_REPO)) for x in PATH_TO_REPO.glob("**/*.py")]
     md_files = [str(x.relative_to(PATH_TO_REPO)) for x in PATH_TO_REPO.glob("**/*.md")]
+
     test_files_to_run = py_files + md_files
+    # change to use "/" as path separator
+    test_files_to_run = ["/".join(Path(x).parts) for x in test_files_to_run]

     # only include files in `src` or `docs/source/en/`
     test_files_to_run = [x for x in test_files_to_run if x.startswith(("src/", "docs/source/en/"))]

From fc8eda36c57d83ca1c760cda6ed92c683ada3a9a Mon Sep 17 00:00:00 2001
From: Sai-Suraj-27
Date: Mon, 15 Apr 2024 17:01:37 +0530
Subject: [PATCH 103/110] fix: Fixed `type annotation` for compatibility with
 python 3.8 (#30243)

* Fixed type annotation for compatibility with python 3.8

* Fixed unsorted imports.
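For context, builtin generics such as `list[bytes]` (PEP 585) are only subscriptable from Python 3.9 onwards; on Python 3.8 the annotation is evaluated when the function is defined and raises a `TypeError`. A minimal sketch of the failure mode and the fix (the `split_into_bytes` helper is a made-up stand-in for the actual `_bpe` function):

```python
from typing import List


# On Python 3.8, writing `-> list[bytes]` fails at definition time with
# "TypeError: 'type' object is not subscriptable"; typing.List works on
# every version the library supports.
def split_into_bytes(token: bytes) -> List[bytes]:
    return [bytes([b]) for b in token]


print(split_into_bytes(b"abc"))  # [b'a', b'b', b'c']
```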
--- src/transformers/models/whisper/convert_openai_to_hf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/whisper/convert_openai_to_hf.py b/src/transformers/models/whisper/convert_openai_to_hf.py index 8d29a8434a..702fbd0204 100755 --- a/src/transformers/models/whisper/convert_openai_to_hf.py +++ b/src/transformers/models/whisper/convert_openai_to_hf.py @@ -21,7 +21,7 @@ import os import tempfile import urllib import warnings -from typing import Any, Optional, Tuple +from typing import Any, List, Optional, Tuple import torch from huggingface_hub.utils import insecure_hashlib @@ -252,7 +252,7 @@ def convert_openai_whisper_to_tfms( # Adapted from https://github.com/openai/tiktoken/issues/60#issuecomment-1499977960 -def _bpe(mergeable_ranks, token: bytes, max_rank=None) -> list[bytes]: +def _bpe(mergeable_ranks, token: bytes, max_rank=None) -> List[bytes]: parts = [bytes([b]) for b in token] while True: min_idx = None From ec344b560d438b40b685674fcf268642c6c1172a Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 15 Apr 2024 12:36:50 +0100 Subject: [PATCH 104/110] Separate out kwargs in processor (#30193) * Separate out kwargs in processor * Fix up --- src/transformers/models/clip/processing_clip.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/clip/processing_clip.py b/src/transformers/models/clip/processing_clip.py index 33fe25a21e..60805402b4 100644 --- a/src/transformers/models/clip/processing_clip.py +++ b/src/transformers/models/clip/processing_clip.py @@ -92,15 +92,21 @@ class CLIPProcessor(ProcessorMixin): `None`). - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. """ + tokenizer_kwargs, image_processor_kwargs = {}, {} + if kwargs: + tokenizer_kwargs = {k: v for k, v in kwargs.items() if k not in self.image_processor._valid_processor_keys} + image_processor_kwargs = { + k: v for k, v in kwargs.items() if k in self.image_processor._valid_processor_keys + } if text is None and images is None: raise ValueError("You have to specify either text or images. 
Both cannot be none.") if text is not None: - encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs) + encoding = self.tokenizer(text, return_tensors=return_tensors, **tokenizer_kwargs) if images is not None: - image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs) + image_features = self.image_processor(images, return_tensors=return_tensors, **image_processor_kwargs) if text is not None and images is not None: encoding["pixel_values"] = image_features.pixel_values From fe2d20d275d3591e2619a1adb0fa6ae272605208 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 15 Apr 2024 14:10:59 +0200 Subject: [PATCH 105/110] Fix doctest more (for `docs/source/en`) (#30247) * fix * fix --------- Co-authored-by: ydshieh --- docs/source/en/generation_strategies.md | 10 +++++----- docs/source/en/model_doc/code_llama.md | 11 ++++++----- docs/source/en/model_doc/phi.md | 10 ++++++---- docs/source/en/model_doc/stablelm.md | 24 ++++++++++++++---------- docs/source/en/model_doc/starcoder2.md | 3 +-- docs/source/en/model_doc/t5.md | 2 +- docs/source/en/tasks/prompting.md | 4 ++-- utils/slow_documentation_tests.txt | 1 + 8 files changed, 36 insertions(+), 29 deletions(-) diff --git a/docs/source/en/generation_strategies.md b/docs/source/en/generation_strategies.md index b70b17116f..c1d88c90b6 100644 --- a/docs/source/en/generation_strategies.md +++ b/docs/source/en/generation_strategies.md @@ -57,9 +57,10 @@ When you load a model explicitly, you can inspect the generation configuration t >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2") >>> model.generation_config GenerationConfig { - "bos_token_id": 50256, - "eos_token_id": 50256, + "bos_token_id": 50256, + "eos_token_id": 50256 } + ``` Printing out the `model.generation_config` reveals only the values that are different from the default generation @@ -244,8 +245,7 @@ To enable multinomial sampling set `do_sample=True` and `num_beams=1`. >>> outputs = model.generate(**inputs, do_sample=True, num_beams=1, max_new_tokens=100) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Today was an amazing day because when you go to the World Cup and you don\'t, or when you don\'t get invited, -that\'s a terrible feeling."'] +["Today was an amazing day because we received these wonderful items by the way of a gift shop. The box arrived on a Thursday and I opened it on Monday afternoon to receive the gifts. Both bags featured pieces from all the previous years!\n\nThe box had lots of surprises in it, including some sweet little mini chocolate chips! I don't think I'd eat all of these. This was definitely one of the most expensive presents I have ever got, I actually got most of them for free!\n\nThe first package came"] ``` ### Beam-search decoding @@ -393,7 +393,7 @@ just like in multinomial sampling. However, in assisted decoding, reducing the t >>> assistant_model = AutoModelForCausalLM.from_pretrained(assistant_checkpoint) >>> outputs = model.generate(**inputs, assistant_model=assistant_model, do_sample=True, temperature=0.5) >>> tokenizer.batch_decode(outputs, skip_special_tokens=True) -['Alice and Bob are going to the same party. 
It is a small party, in a small']
+['Alice and Bob, a couple of friends of mine, who are both in the same office as']
 ```

 Alternatively, you can also set the `prompt_lookup_num_tokens` to trigger n-gram based assisted decoding, as opposed
diff --git a/docs/source/en/model_doc/code_llama.md b/docs/source/en/model_doc/code_llama.md
index 38d50c8733..6c05fc8458 100644
--- a/docs/source/en/model_doc/code_llama.md
+++ b/docs/source/en/model_doc/code_llama.md
@@ -65,9 +65,9 @@ After conversion, the model and tokenizer can be loaded via:
 >>> tokenizer = CodeLlamaTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
 >>> model = LlamaForCausalLM.from_pretrained("codellama/CodeLlama-7b-hf")
 >>> PROMPT = '''def remove_non_ascii(s: str) -> str:
-    """ <FILL_ME>
-    return result
-'''
+...     """ <FILL_ME>
+...     return result
+... '''
 >>> input_ids = tokenizer(PROMPT, return_tensors="pt")["input_ids"]
 >>> generated_ids = model.generate(input_ids, max_new_tokens=128)

@@ -75,10 +75,10 @@ After conversion, the model and tokenizer can be loaded via:
 >>> print(PROMPT.replace("<FILL_ME>", filling))
 def remove_non_ascii(s: str) -> str:
     """ Remove non-ASCII characters from a string.
-
+<BLANKLINE>
     Args:
         s: The string to remove non-ASCII characters from.
-
+<BLANKLINE>
     Returns:
         The string with non-ASCII characters removed.
     """
@@ -87,6 +87,7 @@ def remove_non_ascii(s: str) -> str:
         if ord(c) < 128:
             result += c
     return result
+<BLANKLINE>
 ```

 If you only want the infilled part:
diff --git a/docs/source/en/model_doc/phi.md b/docs/source/en/model_doc/phi.md
index 96efe4a303..ef163213bf 100644
--- a/docs/source/en/model_doc/phi.md
+++ b/docs/source/en/model_doc/phi.md
@@ -92,7 +92,9 @@ Phi-2 has been integrated in the development version (4.37.0.dev) of `transforme
 >>> outputs = model.generate(**inputs, max_length=30)
 >>> text = tokenizer.batch_decode(outputs)[0]
 >>> print(text)
-'Can you help me write a formal email to a potential business partner proposing a joint venture?\nInput: Company A: ABC Inc.\nCompany B: XYZ Ltd.\nJoint Venture: A new online platform for e-commerce'
+Can you help me write a formal email to a potential business partner proposing a joint venture?
+Input: Company A: ABC Inc.
+Company B
 ```

 ### Example :

@@ -134,7 +136,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 >>> from transformers import PhiForCausalLM, AutoTokenizer

 >>> # define the model and tokenizer and push the model and tokens to the GPU.
->>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda")
+>>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda")  # doctest: +SKIP
 >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

 >>> # feel free to change the prompt to your liking.
@@ -144,9 +146,9 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
 >>> tokens = tokenizer(prompt, return_tensors="pt").to("cuda")

 >>> # use the model to generate new tokens.
->>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10) +>>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10) # doctest: +SKIP ->>> tokenizer.batch_decode(generated_output)[0] +>>> tokenizer.batch_decode(generated_output)[0] # doctest: +SKIP 'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled' ``` diff --git a/docs/source/en/model_doc/stablelm.md b/docs/source/en/model_doc/stablelm.md index 90e634b2f7..6a50995ca0 100644 --- a/docs/source/en/model_doc/stablelm.md +++ b/docs/source/en/model_doc/stablelm.md @@ -37,19 +37,21 @@ We also provide `StableLM Zephyr 3B`, an instruction fine-tuned version of the m The following code snippet demonstrates how to use `StableLM 3B 4E1T` for inference: ```python ->>> from transformers import AutoModelForCausalLM, AutoTokenizer +>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed >>> device = "cuda" # the device to load the model onto +>>> set_seed(0) + >>> tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t") >>> model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t") ->>> model.to(device) +>>> model.to(device) # doctest: +IGNORE_RESULT >>> model_inputs = tokenizer("The weather is always wonderful in", return_tensors="pt").to(model.device) >>> generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True) >>> responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) >>> responses -['The weather is always wonderful in Santa Barbara and, for visitors hoping to make the move to our beautiful seaside city, this town offers plenty of great places to...'] +['The weather is always wonderful in Costa Rica, which makes it a prime destination for retirees. That’s where the Pensionado program comes in, offering'] ``` ## Combining StableLM and Flash Attention 2 @@ -66,19 +68,21 @@ Now, to run the model with Flash Attention 2, refer to the snippet below: ```python >>> import torch ->>> from transformers import AutoModelForCausalLM, AutoTokenizer +>>> from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed >>> device = "cuda" # the device to load the model onto +>>> set_seed(0) + >>> tokenizer = AutoTokenizer.from_pretrained("stabilityai/stablelm-3b-4e1t") ->>> model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2") ->>> model.to(device) +>>> model = AutoModelForCausalLM.from_pretrained("stabilityai/stablelm-3b-4e1t", torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2") # doctest: +SKIP +>>> model.to(device) # doctest: +SKIP >>> model_inputs = tokenizer("The weather is always wonderful in", return_tensors="pt").to(model.device) ->>> generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True) ->>> responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) ->>> responses -['The weather is always wonderful in Santa Barbara and, for visitors hoping to make the move to our beautiful seaside city, this town offers plenty of great places to...'] +>>> generated_ids = model.generate(**model_inputs, max_length=32, do_sample=True) # doctest: +SKIP +>>> responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True) # doctest: +SKIP +>>> responses # doctest: +SKIP +['The weather is always wonderful in Costa Rica, which makes it a prime destination for retirees. 
That’s where the Pensionado program comes in, offering'] ``` diff --git a/docs/source/en/model_doc/starcoder2.md b/docs/source/en/model_doc/starcoder2.md index 851ee5ea6b..9e2e547b8c 100644 --- a/docs/source/en/model_doc/starcoder2.md +++ b/docs/source/en/model_doc/starcoder2.md @@ -42,11 +42,10 @@ These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hu >>> prompt = "def print_hello_world():" >>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda") ->>> model.to(device) >>> generated_ids = model.generate(**model_inputs, max_new_tokens=10, do_sample=False) >>> tokenizer.batch_decode(generated_ids)[0] -"def print_hello_world():\n\treturn 'Hello World!'" +'def print_hello_world():\n print("Hello World!")\n\ndef print' ``` ## Starcoder2Config diff --git a/docs/source/en/model_doc/t5.md b/docs/source/en/model_doc/t5.md index 70e80c459f..86a645512c 100644 --- a/docs/source/en/model_doc/t5.md +++ b/docs/source/en/model_doc/t5.md @@ -309,7 +309,7 @@ The predicted tokens will then be placed between the sentinel tokens. >>> sequence_ids = model.generate(input_ids) >>> sequences = tokenizer.batch_decode(sequence_ids) >>> sequences -[' park offers the park.'] +[' park offers the park.'] ``` ## Performance diff --git a/docs/source/en/tasks/prompting.md b/docs/source/en/tasks/prompting.md index 1746e36fb9..9100d48396 100644 --- a/docs/source/en/tasks/prompting.md +++ b/docs/source/en/tasks/prompting.md @@ -80,7 +80,7 @@ Run inference with decoder-only models with the `text-generation` pipeline: >>> prompt = "Hello, I'm a language model" >>> generator(prompt, max_length = 30) -[{'generated_text': "Hello, I'm a language model expert, so I'm a big believer in the concept that I know very well and then I try to look into"}] +[{'generated_text': "Hello, I'm a language model programmer so you can use some of my stuff. But you also need some sort of a C program to run."}] ``` To run inference with an encoder-decoder, use the `text2text-generation` pipeline: @@ -284,7 +284,7 @@ the leading word or phrase (`"Answer:"`) to nudge the model to start generating >>> for seq in sequences: ... print(f"Result: {seq['generated_text']}") -Result: Modern tools are used, such as immersion blenders +Result: Modern tools often used to make gazpacho include ``` #### Reasoning diff --git a/utils/slow_documentation_tests.txt b/utils/slow_documentation_tests.txt index e36eae6e2d..65e05ed893 100644 --- a/utils/slow_documentation_tests.txt +++ b/utils/slow_documentation_tests.txt @@ -1,4 +1,5 @@ docs/source/en/generation_strategies.md +docs/source/en/model_doc/code_llama.md docs/source/en/model_doc/ctrl.md docs/source/en/model_doc/kosmos-2.md docs/source/en/model_doc/seamless_m4t.md From 766810153b3a701e99229229c3899e370be7b7a1 Mon Sep 17 00:00:00 2001 From: LZR Date: Mon, 15 Apr 2024 05:53:21 -0700 Subject: [PATCH 106/110] round epoch only in console (#30237) --- src/transformers/trainer.py | 2 +- src/transformers/trainer_callback.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 5b8ffeafc7..45b45992bf 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -3048,7 +3048,7 @@ class Trainer: The values to log. 
""" if self.state.epoch is not None: - logs["epoch"] = round(self.state.epoch, 2) + logs["epoch"] = self.state.epoch if self.args.include_num_input_tokens_seen: logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen diff --git a/src/transformers/trainer_callback.py b/src/transformers/trainer_callback.py index f5bbcdbd42..225f645d63 100644 --- a/src/transformers/trainer_callback.py +++ b/src/transformers/trainer_callback.py @@ -15,6 +15,7 @@ """ Callbacks to use with the Trainer class and customize the training loop. """ +import copy import dataclasses import json from dataclasses import dataclass @@ -520,7 +521,12 @@ class ProgressCallback(TrainerCallback): def on_log(self, args, state, control, logs=None, **kwargs): if state.is_world_process_zero and self.training_bar is not None: + # avoid modifying the logs object as it is shared between callbacks + logs = copy.deepcopy(logs) _ = logs.pop("total_flos", None) + # round numbers so that it looks better in console + if "epoch" in logs: + logs["epoch"] = round(logs["epoch"], 2) self.training_bar.write(str(logs)) def on_train_end(self, args, state, control, **kwargs): From 440bd3c3c06004115752c57530c7150b38f2a0a7 Mon Sep 17 00:00:00 2001 From: Yih-Dar <2521628+ydshieh@users.noreply.github.com> Date: Mon, 15 Apr 2024 15:08:09 +0200 Subject: [PATCH 107/110] update github actions packages' version to suppress warnings (#30249) update Co-authored-by: ydshieh --- .github/workflows/add-model-like.yml | 4 ++-- .github/workflows/build-docker-images.yml | 18 +++++++------- .../build-nightly-ci-docker-images.yml | 4 ++-- .../workflows/build-past-ci-docker-images.yml | 4 ++-- .github/workflows/check_tiny_models.yml | 10 ++++---- .github/workflows/doctest_job.yml | 2 +- .github/workflows/doctests.yml | 6 ++--- .github/workflows/model-templates.yml | 4 ++-- .github/workflows/model_jobs.yml | 2 +- .github/workflows/push-important-models.yml | 8 +++---- .github/workflows/self-nightly-scheduled.yml | 10 ++++---- .github/workflows/self-past.yml | 12 +++++----- .github/workflows/self-push-amd.yml | 10 ++++---- .github/workflows/self-push-caller.yml | 2 +- .github/workflows/self-push.yml | 14 +++++------ .github/workflows/self-scheduled-amd.yml | 24 +++++++++---------- .github/workflows/self-scheduled.yml | 16 ++++++------- .github/workflows/slack-report.yml | 10 ++++---- .github/workflows/stale.yml | 2 +- .github/workflows/update_metdata.yml | 2 +- utils/extract_warnings.py | 2 +- 21 files changed, 83 insertions(+), 83 deletions(-) diff --git a/.github/workflows/add-model-like.yml b/.github/workflows/add-model-like.yml index 8bdd66e446..5a1b953ef6 100644 --- a/.github/workflows/add-model-like.yml +++ b/.github/workflows/add-model-like.yml @@ -16,7 +16,7 @@ jobs: name: "Add new model like template tests" runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Install dependencies run: | @@ -74,7 +74,7 @@ jobs: - name: Test suite reports artifacts if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: run_all_tests_new_models_test_reports path: reports/tests_new_models diff --git a/.github/workflows/build-docker-images.yml b/.github/workflows/build-docker-images.yml index 23424ffb83..7c9e86d091 100644 --- a/.github/workflows/build-docker-images.yml +++ b/.github/workflows/build-docker-images.yml @@ -27,7 +27,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: 
docker/login-action@v3 @@ -76,7 +76,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 @@ -113,7 +113,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 @@ -145,7 +145,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 @@ -181,7 +181,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 @@ -207,7 +207,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 @@ -248,7 +248,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 @@ -274,7 +274,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 @@ -315,7 +315,7 @@ jobs: uses: docker/setup-buildx-action@v3 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v3 diff --git a/.github/workflows/build-nightly-ci-docker-images.yml b/.github/workflows/build-nightly-ci-docker-images.yml index 63bc7daa74..d7c18775a8 100644 --- a/.github/workflows/build-nightly-ci-docker-images.yml +++ b/.github/workflows/build-nightly-ci-docker-images.yml @@ -30,7 +30,7 @@ jobs: uses: docker/setup-buildx-action@v2 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v2 @@ -67,7 +67,7 @@ jobs: uses: docker/setup-buildx-action@v2 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Login to DockerHub uses: docker/login-action@v2 diff --git a/.github/workflows/build-past-ci-docker-images.yml b/.github/workflows/build-past-ci-docker-images.yml index 302386b685..5ef7c7e7de 100644 --- a/.github/workflows/build-past-ci-docker-images.yml +++ b/.github/workflows/build-past-ci-docker-images.yml @@ -23,7 +23,7 @@ jobs: uses: docker/setup-buildx-action@v2 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - id: get-base-image name: Get Base Image @@ -67,7 +67,7 @@ jobs: uses: docker/setup-buildx-action@v2 - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - id: get-base-image name: Get Base Image diff --git a/.github/workflows/check_tiny_models.yml b/.github/workflows/check_tiny_models.yml index 0725bd04a1..56a84f776b 100644 --- a/.github/workflows/check_tiny_models.yml +++ b/.github/workflows/check_tiny_models.yml @@ -17,11 +17,11 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout transformers - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 2 - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Set up Python 3.8 uses: actions/setup-python@v4 with: @@ -44,7 +44,7 @@ jobs: - name: Local tiny model reports artifacts if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 
with: name: tiny_local_model_creation_reports path: tiny_local_models/reports @@ -56,7 +56,7 @@ jobs: - name: Test suite reports artifacts if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: tiny_local_model_creation_reports path: reports/tests_pipelines @@ -76,7 +76,7 @@ jobs: - name: New tiny model creation reports artifacts if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: tiny_model_creation_reports path: tiny_models/reports diff --git a/.github/workflows/doctest_job.yml b/.github/workflows/doctest_job.yml index 3e923c1c8f..994c1b5085 100644 --- a/.github/workflows/doctest_job.yml +++ b/.github/workflows/doctest_job.yml @@ -75,7 +75,7 @@ jobs: - name: "Test suite reports artifacts: doc_tests_gpu_test_reports_${{ env.split_keys }}" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: doc_tests_gpu_test_reports_${{ env.split_keys }} path: /transformers/reports/doc_tests_gpu_${{ env.split_keys }} diff --git a/.github/workflows/doctests.yml b/.github/workflows/doctests.yml index 014740685e..ad2366751d 100644 --- a/.github/workflows/doctests.yml +++ b/.github/workflows/doctests.yml @@ -67,8 +67,8 @@ jobs: if: always() needs: [call_doctest_job] steps: - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} @@ -81,7 +81,7 @@ jobs: - name: "Upload results" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: doc_test_results path: doc_test_results \ No newline at end of file diff --git a/.github/workflows/model-templates.yml b/.github/workflows/model-templates.yml index eb77d9dcbe..d34a28508e 100644 --- a/.github/workflows/model-templates.yml +++ b/.github/workflows/model-templates.yml @@ -10,7 +10,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Install dependencies run: | @@ -75,7 +75,7 @@ jobs: - name: Test suite reports artifacts if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: run_all_tests_templates_test_reports path: reports/tests_templates diff --git a/.github/workflows/model_jobs.yml b/.github/workflows/model_jobs.yml index 8bf8d78570..978e5f617e 100644 --- a/.github/workflows/model_jobs.yml +++ b/.github/workflows/model_jobs.yml @@ -96,7 +96,7 @@ jobs: - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ inputs.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports path: /transformers/reports/${{ inputs.machine_type }}_tests_gpu_${{ matrix.folders }} diff --git a/.github/workflows/push-important-models.yml b/.github/workflows/push-important-models.yml index 5eef6f40f8..a6ea03277d 100644 --- a/.github/workflows/push-important-models.yml +++ b/.github/workflows/push-important-models.yml @@ -25,7 +25,7 @@ jobs: matrix: ${{ steps.set-matrix.outputs.matrix }} steps: - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Get changed files id: changed-files @@ -65,7 +65,7 @@ jobs: steps: - name: Check out code - uses: actions/checkout@v3 + uses: actions/checkout@v4 - 
name: Install locally transformers & other libs run: | @@ -90,7 +90,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.model-name }}_fa2_tests path: /transformers/reports/${{ matrix.model-name }}_fa2_tests @@ -112,7 +112,7 @@ jobs: - name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: tests_integration_${{ matrix.model-name }} path: /transformers/reports/tests_integration_${{ matrix.model-name }} diff --git a/.github/workflows/self-nightly-scheduled.yml b/.github/workflows/self-nightly-scheduled.yml index 5c3e30e4b4..7906325e83 100644 --- a/.github/workflows/self-nightly-scheduled.yml +++ b/.github/workflows/self-nightly-scheduled.yml @@ -117,7 +117,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -178,7 +178,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_nightly path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -240,7 +240,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_nightly path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu @@ -262,8 +262,8 @@ jobs: run: | echo "Setup status: ${{ needs.setup.result }}" - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} diff --git a/.github/workflows/self-past.yml b/.github/workflows/self-past.yml index 6b7587fdeb..7be658c432 100644 --- a/.github/workflows/self-past.yml +++ b/.github/workflows/self-past.yml @@ -143,7 +143,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -223,7 +223,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}" if: ${{ always() }} - uses: 
actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -295,7 +295,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }}" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports_postfix_${{ inputs.framework }}-${{ inputs.version }} path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu @@ -317,8 +317,8 @@ jobs: run: | echo "Setup status: ${{ needs.setup.result }}" - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 # Create a directory to store test failure tables in the next step - name: Create directory @@ -344,7 +344,7 @@ jobs: # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - name: Failure table artifacts if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: test_failure_tables_${{ inputs.framework }}-${{ inputs.version }} path: test_failure_tables diff --git a/.github/workflows/self-push-amd.yml b/.github/workflows/self-push-amd.yml index 4bd7c1f487..b285a5f8fc 100644 --- a/.github/workflows/self-push-amd.yml +++ b/.github/workflows/self-push-amd.yml @@ -23,7 +23,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout transformers - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 2 @@ -121,7 +121,7 @@ jobs: python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt - name: Report fetched tests - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: test_fetched path: /transformers/test_preparation.txt @@ -239,7 +239,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -288,7 +288,7 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # To avoid failure when multiple commits are merged into `main` in a short period of time. # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... 
# (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) @@ -303,7 +303,7 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} diff --git a/.github/workflows/self-push-caller.yml b/.github/workflows/self-push-caller.yml index 14b5262426..59adde4c54 100644 --- a/.github/workflows/self-push-caller.yml +++ b/.github/workflows/self-push-caller.yml @@ -19,7 +19,7 @@ jobs: outputs: changed: ${{ steps.was_changed.outputs.changed }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: "2" diff --git a/.github/workflows/self-push.yml b/.github/workflows/self-push.yml index fd823ce4f5..17dff31fa4 100644 --- a/.github/workflows/self-push.yml +++ b/.github/workflows/self-push.yml @@ -97,7 +97,7 @@ jobs: python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt - name: Report fetched tests - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: test_fetched path: /transformers/test_preparation.txt @@ -209,7 +209,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -304,7 +304,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -394,7 +394,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu @@ -484,7 +484,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu @@ -530,7 +530,7 @@ jobs: echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" echo "env.CI_SHA = ${{ env.CI_SHA }}" - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 # To avoid failure when multiple commits are merged into `main` in a short period of time. # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... 
# (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) @@ -545,7 +545,7 @@ jobs: git checkout ${{ env.CI_SHA }} echo "log = $(git log -n 1)" - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} diff --git a/.github/workflows/self-scheduled-amd.yml b/.github/workflows/self-scheduled-amd.yml index 69f5f861a3..0992607180 100644 --- a/.github/workflows/self-scheduled-amd.yml +++ b/.github/workflows/self-scheduled-amd.yml @@ -29,7 +29,7 @@ jobs: runs-on: ubuntu-22.04 steps: - name: Checkout transformers - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 2 @@ -171,7 +171,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -239,7 +239,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports path: /transformers/reports/${{ matrix.machine_type }}_tests_gpu_${{ matrix.folders }} @@ -296,7 +296,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_examples_gpu path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu @@ -352,7 +352,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu @@ -409,7 +409,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_deepspeed_gpu_test_reports path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_deepspeed_gpu @@ -430,7 +430,7 @@ jobs: ] steps: - name: Checkout transformers - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 2 @@ -443,7 +443,7 @@ jobs: - name: Create output directory run: mkdir warnings_in_ci - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: path: warnings_in_ci @@ -458,7 +458,7 @@ jobs: - name: Upload artifact if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: warnings_in_ci path: warnings_in_ci/selected_warnings.json @@ -487,8 +487,8 @@ jobs: echo "Runner status: ${{ needs.check_runners.result }}" echo "Setup status: ${{ needs.setup.result }}" - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 - name: Send message to Slack env: CI_SLACK_BOT_TOKEN: ${{ 
secrets.CI_SLACK_BOT_TOKEN }} @@ -513,7 +513,7 @@ jobs: # Upload complete failure tables, as they might be big and only truncated versions could be sent to Slack. - name: Failure table artifacts if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: test_failure_tables path: test_failure_tables diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 3590bb9f84..fa41bffc0b 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -143,7 +143,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_pipeline_gpu path: /transformers/reports/${{ matrix.machine_type }}_tests_torch_pipeline_gpu @@ -194,7 +194,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_tf_pipeline_gpu path: /transformers/reports/${{ matrix.machine_type }}_tests_tf_pipeline_gpu @@ -245,7 +245,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_examples_gpu" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_examples_gpu path: /transformers/reports/${{ matrix.machine_type }}_examples_gpu @@ -305,7 +305,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_torch_cuda_extensions_gpu_test_reports path: /workspace/transformers/reports/${{ matrix.machine_type }}_tests_torch_cuda_extensions_gpu @@ -366,7 +366,7 @@ jobs: - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }}" if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: ${{ matrix.machine_type }}_run_tests_quantization_torch_gpu_${{ env.matrix_folders }} path: /transformers/reports/${{ matrix.machine_type }}_tests_quantization_torch_gpu_${{ matrix.folders }} @@ -379,7 +379,7 @@ jobs: needs: [setup, run_tests_gpu] steps: - name: Checkout transformers - uses: actions/checkout@v3 + uses: actions/checkout@v4 with: fetch-depth: 2 @@ -392,7 +392,7 @@ jobs: - name: Create output directory run: mkdir warnings_in_ci - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: path: warnings_in_ci @@ -407,7 +407,7 @@ jobs: - name: Upload artifact if: ${{ always() }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: warnings_in_ci path: warnings_in_ci/selected_warnings.json diff --git a/.github/workflows/slack-report.yml b/.github/workflows/slack-report.yml index 9e62417c76..5c46037554 100644 --- a/.github/workflows/slack-report.yml +++ b/.github/workflows/slack-report.yml @@ -32,8 +32,8 @@ jobs: run: | echo "Setup status: ${{ inputs.setup_status }}" - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 - name: Send message to Slack if: ${{ inputs.job != 'run_tests_quantization_torch_gpu' }} env: @@ -58,8 +58,8 @@ 
jobs: pip show slack_sdk python utils/notification_service.py "${{ inputs.folder_slices }}" - - uses: actions/checkout@v3 - - uses: actions/download-artifact@v3 + - uses: actions/checkout@v4 + - uses: actions/download-artifact@v4 - name: Send message to Slack for quantization workflow if: ${{ inputs.job == 'run_tests_quantization_torch_gpu' }} env: @@ -81,7 +81,7 @@ jobs: - name: Failure table artifacts # Only the model testing job is concerned for this step if: ${{ inputs.job == 'run_tests_gpu' }} - uses: actions/upload-artifact@v3 + uses: actions/upload-artifact@v4 with: name: prev_ci_results path: prev_ci_results diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 4a7e94bac4..4fd4a8cb7b 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -12,7 +12,7 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup Python uses: actions/setup-python@v4 diff --git a/.github/workflows/update_metdata.yml b/.github/workflows/update_metdata.yml index a2269e32e4..90cd73077a 100644 --- a/.github/workflows/update_metdata.yml +++ b/.github/workflows/update_metdata.yml @@ -14,7 +14,7 @@ jobs: shell: bash -l {0} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Setup environment run: | diff --git a/utils/extract_warnings.py b/utils/extract_warnings.py index 38c6ac5ecb..5f49b1b456 100644 --- a/utils/extract_warnings.py +++ b/utils/extract_warnings.py @@ -108,7 +108,7 @@ if __name__ == "__main__": from_gh = args.from_gh if from_gh: - # The artifacts have to be downloaded using `actions/download-artifact@v3` + # The artifacts have to be downloaded using `actions/download-artifact@v4` pass else: os.makedirs(args.output_dir, exist_ok=True) From 667939a2d327cc4db80fe4879f86c4f999dcb490 Mon Sep 17 00:00:00 2001 From: Fanli Lin Date: Mon, 15 Apr 2024 23:30:52 +0800 Subject: [PATCH 108/110] [tests] add the missing `require_torch_multi_gpu` flag (#30250) add gpu flag --- tests/models/bros/test_modeling_bros.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/models/bros/test_modeling_bros.py b/tests/models/bros/test_modeling_bros.py index 755deefcb4..4b1290ed49 100644 --- a/tests/models/bros/test_modeling_bros.py +++ b/tests/models/bros/test_modeling_bros.py @@ -17,7 +17,7 @@ import copy import unittest -from transformers.testing_utils import require_torch, slow, torch_device +from transformers.testing_utils import require_torch, require_torch_multi_gpu, slow, torch_device from transformers.utils import is_torch_available from ...test_configuration_common import ConfigTester @@ -344,6 +344,7 @@ class BrosModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + @require_torch_multi_gpu def test_multi_gpu_data_parallel_forward(self): super().test_multi_gpu_data_parallel_forward() From 6b78360e6d686b316360334f5109b46c39ff5ed8 Mon Sep 17 00:00:00 2001 From: amyeroberts <22614925+amyeroberts@users.noreply.github.com> Date: Mon, 15 Apr 2024 17:03:03 +0100 Subject: [PATCH 109/110] Add Idefics2 (#30253) * Initial add model additions * Test * All weights loading * Can perform full forward pass * Local and remote the same * Matching local and remote * Fixup * Idefics2Model importable; fixup docstrings * Don't skip by default * Remove deprecated use_resampler arg * Remove self.config * DecoupledLinear takes 
config * Tidy up * Enable eager attention and tidy up * Most tests passing * Update for batch of processed images * Add image processor * Update doc pages * Update conversion script * Remove erroneous breakpoint * Remove accidental spelling change * Update to reflect changes on hub - make generate work * Fix up * Image processor tests * Update tests * Add a processor * Add a processor * Update convert script * Update modeling file - remove fixmes * Bug fix * Add processing test * Use processor * Fix up * Update src/transformers/models/idefics2/modeling_idefics2.py Co-authored-by: Victor SANH * Update src/transformers/models/idefics2/modeling_idefics2.py Co-authored-by: Victor SANH * Fix test * Update config - PR comments and defaults align with checkpoint * Reviewer comments * Add copied froms for flash attention * Update src/transformers/models/idefics2/modeling_idefics2.py Co-authored-by: Victor SANH * Apply suggestions from code review Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Remove qk_layer_norm and freeze_layers functionality * Fix * Remove freeze_layer options from config * Sync with upstream main * Fix attention shapes siglip * Remove Llava-next refs - TO REBASE * Use AutoModel for text model * Add comment to explain vision embeddings * Fix issue with tie_word_embeddings * Address review comments * Fix and fix up * Chat templates for idefics * Fix copies * Fix * Add layer norms to FA2 * Fix tests * Apply suggestions from code review Co-authored-by: Victor SANH * Fix * Review comments * Update src/transformers/models/idefics2/modeling_idefics2.py Co-authored-by: Victor SANH * Update inputs merger * Merge weights in correct order * Update convert script * Update src/transformers/models/idefics2/processing_idefics2.py Co-authored-by: Victor SANH * Update template * Model code examples (fix idefics too) * More review comments * Tidy up * Update processing * Fix attention mask preparation * Update inputs_merger inputs * Vectorize inputs_merger * Update src/transformers/models/idefics2/__init__.py Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> * Update src/transformers/models/idefics2/modeling_idefics2.py * Review comments * saying bye to the `qk_layer_norms` * Simplify * Update latents * Remove erroneous readme changes * Return images when applying chat template * Fix bug - prompt images are for a single sample * Update src/transformers/models/idefics2/modeling_idefics2.py * image splitting * fix test * some more comment * some comment * Apply suggestions from code review Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/models/idefics2/image_processing_idefics2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update processor * Update model tests * Update src/transformers/models/idefics2/processing_idefics2.py Co-authored-by: Victor SANH * Update src/transformers/models/idefics2/processing_idefics2.py Co-authored-by: Victor SANH * Don't add BOS in template * Update src/transformers/models/idefics2/processing_idefics2.py Co-authored-by: Victor SANH * Remove index in examples * Update tests to reflect #13 * Update src/transformers/models/idefics2/processing_idefics2.py Co-authored-by: Victor SANH * PR comment - consistent typing * Update readme and model doc * Update docs * Update checkpoint references * Update examples * Fix and update tests * Small addition * Update tests - remove copied from as no ignore placement copy could be found * Update example * small fixes * Update docs/source/en/model_doc/idefics2.md Co-authored-by: Victor SANH * Update docs/source/en/model_doc/idefics2.md Co-authored-by: Victor SANH * Update README.md Co-authored-by: Victor SANH * Connector model as bridge * Fix up * Fix up * Don't pass model inputs for generation kwargs update * IDEFICS-2 -> Idefics2 * Remove config archive name * IDEFICS-2 -> Idefics2 * Add back llava-next * Update readmes * Add requirements for processor tester * Use custom convert_to_rgb to avoid possible BC * Fix doc example * Fix doc example * Skip model doc tests - as model too large * More doc example - account for image splitting * Update src/transformers/image_transforms.py * Fix config doctest --------- Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> Co-authored-by: ArthurZucker Co-authored-by: Victor SANH Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com> --- README.md | 3 +- README_de.md | 3 +- README_es.md | 11 +- README_fr.md | 1 + README_hd.md | 1 + README_ja.md | 1 + README_ko.md | 1 + README_pt-br.md | 9 +- README_ru.md | 9 +- README_te.md | 7 +- README_vi.md | 3 +- README_zh-hans.md | 1 + README_zh-hant.md | 7 +- docs/source/en/_toctree.yml | 2 + docs/source/en/index.md | 1 + docs/source/en/model_doc/idefics2.md | 98 + docs/source/en/perf_infer_gpu_one.md | 11 +- src/transformers/__init__.py | 20 + src/transformers/image_transforms.py | 4 +- src/transformers/image_utils.py | 2 +- src/transformers/models/__init__.py | 1 + .../models/auto/configuration_auto.py | 3 + .../models/auto/image_processing_auto.py | 1 + src/transformers/models/auto/modeling_auto.py | 3 + .../models/auto/processing_auto.py | 1 + .../models/auto/tokenization_auto.py | 1 + .../models/idefics/modeling_idefics.py | 25 +- src/transformers/models/idefics2/__init__.py | 74 + .../models/idefics2/configuration_idefics2.py | 262 +++ .../convert_idefics2_weights_to_hf.py | 185 ++ .../idefics2/image_processing_idefics2.py | 596 +++++ .../models/idefics2/modeling_idefics2.py | 1959 +++++++++++++++++ .../models/idefics2/processing_idefics2.py | 348 +++ src/transformers/utils/dummy_pt_objects.py | 31 + .../utils/dummy_vision_objects.py | 7 + tests/models/idefics2/__init__.py | 0 .../test_image_processing_idefics2.py | 270 +++ .../models/idefics2/test_modeling_idefics2.py | 528 +++++ .../idefics2/test_processing_idefics2.py | 235 ++ tests/test_modeling_common.py | 2 + utils/slow_documentation_tests.txt | 3 +- 41 files changed, 4692 insertions(+), 38 deletions(-) create mode 100644 docs/source/en/model_doc/idefics2.md create mode 100644 src/transformers/models/idefics2/__init__.py create mode 100644 src/transformers/models/idefics2/configuration_idefics2.py create mode 100644 src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py create mode 100644 src/transformers/models/idefics2/image_processing_idefics2.py create mode 100644 src/transformers/models/idefics2/modeling_idefics2.py create mode 100644 src/transformers/models/idefics2/processing_idefics2.py create mode 100644 tests/models/idefics2/__init__.py create mode 100644 tests/models/idefics2/test_image_processing_idefics2.py create mode 100644 tests/models/idefics2/test_modeling_idefics2.py create mode 100644 tests/models/idefics2/test_processing_idefics2.py diff --git a/README.md b/README.md index de844848a4..b31083cdfb 100644 --- a/README.md +++ b/README.md @@ -331,7 +331,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1.
**[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -395,6 +395,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. 
Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. +1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (from Hugging Face) released with the blog [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. diff --git a/README_de.md b/README_de.md index e5bd3522ca..5891710ef5 100644 --- a/README_de.md +++ b/README_de.md @@ -327,7 +327,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. 
**[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -391,6 +391,7 @@ Aktuelle Anzahl der Checkpoints: ![](https://img.shields.io/endpoint?url=https:/ 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. +1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. diff --git a/README_es.md b/README_es.md index 5f6a2afb7c..d42a0e69d4 100644 --- a/README_es.md +++ b/README_es.md @@ -304,7 +304,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker. 1. 
**[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong. 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve. -1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. +1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere. 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang. 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan. 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie. @@ -368,6 +368,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed. 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer. 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh. +1. 
**[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh. 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever. 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang. 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. @@ -444,7 +445,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee. 1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. 
**[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. @@ -473,9 +474,9 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. -1. 
-1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries.
-1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
+1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** (from Stability AI) released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu.
+1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with a coming soon paper.
+1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
 1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
 1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
diff --git a/README_fr.md b/README_fr.md
index 9c6f71d324..01d6948cfc 100644
--- a/README_fr.md
+++ b/README_fr.md
@@ -389,6 +389,7 @@ Nombre actuel de points de contrôle : ![](https://img.shields.io/endpoint?url=h
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (de Facebook) a été publié dans l'article [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) par Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (de Berkeley) a été publié dans l'article [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) par Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (de HuggingFace) a été publié dans l'article [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) par Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (de Hugging Face) a été publié dans l'article [IDEFICS2](https://huggingface.co/blog/idefics2) par Léo Tronchon, Hugo Laurencon, Victor Sanh.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (d'OpenAI) a été publié dans l'article [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) par Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (de l'Université de Beihang, UC Berkeley, Rutgers University, SEDD Company) a été publié dans l'article [Informer : Au-delà du Transformer efficace pour la prévision de séries temporelles](https://arxiv.org/abs/2012.07436) par Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (de Salesforce) a été publié dans l'article [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) de Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
diff --git a/README_hd.md b/README_hd.md
index 19dfa18b26..11858a2e89 100644
--- a/README_hd.md
+++ b/README_hd.md
@@ -342,6 +342,7 @@ conda install conda-forge::transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (फेसबुक से) साथ में पेपर [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) वेई-निंग सू, बेंजामिन बोल्टे, याओ-हंग ह्यूबर्ट त्साई, कुशाल लखोटिया, रुस्लान सालाखुतदीनोव, अब्देलरहमान मोहम्मद द्वारा।
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (बर्कले से) साथ में कागज [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) सेहून किम, अमीर घोलमी, ज़ेवेई याओ, माइकल डब्ल्यू महोनी, कर्ट केटज़र द्वारा।
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (Hugging Face से) Léo Tronchon, Hugo Laurencon, Victor Sanh. द्वाराअनुसंधान पत्र [IDEFICS2](https://huggingface.co/blog/idefics2) के साथ जारी किया गया
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce से) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. द्वाराअनुसंधान पत्र [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) के साथ जारी किया गया
diff --git a/README_ja.md b/README_ja.md
index 443f650cae..9ae31c329c 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -402,6 +402,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook から) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed から公開された研究論文: [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447)
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley から) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer から公開された研究論文: [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321)
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (Hugging Face から) Léo Tronchon, Hugo Laurencon, Victor Sanh. から公開された研究論文 [IDEFICS2](https://huggingface.co/blog/idefics2)
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI から) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever から公開された研究論文: [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/)
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce から) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. から公開された研究論文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)
diff --git a/README_ko.md b/README_ko.md
index dfb271b737..8919bf5c62 100644
--- a/README_ko.md
+++ b/README_ko.md
@@ -317,6 +317,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (Facebook 에서) Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 의 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 논문과 함께 발표했습니다.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (Berkeley 에서) Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 의 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 논문과 함께 발표했습니다.
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (Hugging Face 에서 제공)은 Léo Tronchon, Hugo Laurencon, Victor Sanh.의 [IDEFICS2](https://huggingface.co/blog/idefics2)논문과 함께 발표했습니다.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (OpenAI 에서) Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 의 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 논문과 함께 발표했습니다.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce 에서 제공)은 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.의 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)논문과 함께 발표했습니다.
diff --git a/README_pt-br.md b/README_pt-br.md
index 8dcdddac00..8db6b67452 100644
--- a/README_pt-br.md
+++ b/README_pt-br.md
@@ -333,10 +333,10 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur
 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
+1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
+1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
@@ -380,7 +380,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur
 1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
 1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
+1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
 1. **[Gemma](https://huggingface.co/docs/transformers/model_doc/gemma)** (from Google) released with the paper [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by the Gemma Google team.
 1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
 1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
@@ -400,6 +400,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
@@ -436,7 +437,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
+1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
diff --git a/README_ru.md b/README_ru.md
index 7ee48cbcaf..a4b234b847 100644
--- a/README_ru.md
+++ b/README_ru.md
@@ -323,10 +323,10 @@ conda install conda-forge::transformers
 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
+1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
+1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
@@ -390,6 +390,7 @@ conda install conda-forge::transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
@@ -425,8 +426,8 @@ conda install conda-forge::transformers
 1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
-1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
+1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
+1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
diff --git a/README_te.md b/README_te.md
index 687a97d697..78e5d540f4 100644
--- a/README_te.md
+++ b/README_te.md
@@ -325,10 +325,10 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా
 1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
 1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
-1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
+1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
+1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
@@ -392,6 +392,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
@@ -428,7 +429,7 @@ Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టా
 1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
 1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
-1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
+1. **[Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
 1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
 1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
 1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
diff --git a/README_vi.md b/README_vi.md
index 1a872b9ce5..4e14331970 100644
--- a/README_vi.md
+++ b/README_vi.md
@@ -327,7 +327,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi
 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** được phát hành với bài báo [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (từ Salesforce) được phát hành với bài báo [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (từ MetaAI) được phát hành với bài báo [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (từ Cohere) được phát hành với bài báo [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
+1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (từ Cohere) được phát hành với bài báo [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (từ Microsoft Research Asia) được phát hành với bài báo [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (từ YituTech) được phát hành với bài báo [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (từ Facebook AI) được phát hành với bài báo [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
@@ -391,6 +391,7 @@ Số lượng điểm kiểm tra hiện tại: ![](https://img.shields.io/endpoi
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (từ Facebook) được phát hành với bài báo [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (từ Berkeley) được phát hành với bài báo [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (từ HuggingFace) được phát hành với bài báo [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (từ Hugging Face) được phát hành với bài báo [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (từ OpenAI) được phát hành với bài báo [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (từ Beihang University, UC Berkeley, Rutgers University, SEDD Company) được phát hành với bài báo [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (từ Salesforce) được phát hành với bài báo [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
diff --git a/README_zh-hans.md b/README_zh-hans.md
index 7e307c781a..9867293298 100644
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -341,6 +341,7 @@ conda install conda-forge::transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (来自 Hugging Face) 伴随论文 [IDEFICS2](https://huggingface.co/blog/idefics2) 由 Léo Tronchon, Hugo Laurencon, Victor Sanh 发布。
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (来自 OpenAI) 伴随论文 [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) 由 Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever 发布。
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (来自 Salesforce) 伴随论文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) 由 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi 发布。
diff --git a/README_zh-hant.md b/README_zh-hant.md
index 1431293df5..7c72d8e6a0 100644
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -289,7 +289,7 @@ conda install conda-forge::transformers
 1. **[CLVP](https://huggingface.co/docs/transformers/model_doc/clvp)** released with the paper [Better speech synthesis through scaling](https://arxiv.org/abs/2305.07243) by James Betker.
 1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
 1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
-1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
+1. **[Cohere](https://huggingface.co/docs/transformers/model_doc/cohere)** (from Cohere) released with the paper [Command-R: Retrieval Augmented Generation at Production Scale]() by Cohere.
 1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
 1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
@@ -353,6 +353,7 @@ conda install conda-forge::transformers
 1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
 1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[Idefics2](https://huggingface.co/docs/transformers/main/model_doc/idefics2)** (from Hugging Face) released with the paper [IDEFICS2](https://huggingface.co/blog/idefics2) by Léo Tronchon, Hugo Laurencon, Victor Sanh.
 1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
@@ -429,7 +430,7 @@ conda install conda-forge::transformers
 1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi, Kyogu Lee.
**[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou. 1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. -1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. +1. **[PVTv2](https://huggingface.co/docs/transformers/model_doc/pvt_v2)** (from Shanghai AI Laboratory, Nanjing University, The University of Hong Kong etc.) released with the paper [PVT v2: Improved Baselines with Pyramid Vision Transformer](https://arxiv.org/abs/2106.13797) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao. 1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius. 1. **[Qwen2](https://huggingface.co/docs/transformers/model_doc/qwen2)** (from the Qwen team, Alibaba Group) released with the paper [Qwen Technical Report](https://arxiv.org/abs/2309.16609) by Jinze Bai, Shuai Bai, Yunfei Chu, Zeyu Cui, Kai Dang, Xiaodong Deng, Yang Fan, Wenbin Ge, Yu Han, Fei Huang, Binyuan Hui, Luo Ji, Mei Li, Junyang Lin, Runji Lin, Dayiheng Liu, Gao Liu, Chengqiang Lu, Keming Lu, Jianxin Ma, Rui Men, Xingzhang Ren, Xuancheng Ren, Chuanqi Tan, Sinan Tan, Jianhong Tu, Peng Wang, Shijie Wang, Wei Wang, Shengguang Wu, Benfeng Xu, Jin Xu, An Yang, Hao Yang, Jian Yang, Shusheng Yang, Yang Yao, Bowen Yu, Hongyi Yuan, Zheng Yuan, Jianwei Zhang, Xingxuan Zhang, Yichang Zhang, Zhenru Zhang, Chang Zhou, Jingren Zhou, Xiaohuan Zhou and Tianhang Zhu. 1. **[Qwen2MoE](https://huggingface.co/docs/transformers/main/model_doc/qwen2_moe)** (from the Qwen team, Alibaba Group) released with the paper [blog post](https://qwenlm.github.io/blog/qwen-moe/) by Bo Zheng, Dayiheng Liu, Rui Men, Junyang Lin, Zhou San, Bowen Yu, An Yang, Mingfeng Xue, Fei Huang, Binyuan Hui, Mei Li, Tianyu Liu, Xingzhang Ren, Xuancheng Ren, Kexin Yang, Chang Zhou, Jingren Zhou. @@ -458,7 +459,7 @@ conda install conda-forge::transformers 1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau. 1. 
**[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. 1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer. -1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. +1. **[StableLm](https://huggingface.co/docs/transformers/model_doc/stablelm)** released with the paper [StableLM 3B 4E1T (Technical Report)](https://stability.wandb.io/stability-llm/stable-lm/reports/StableLM-3B-4E1T--VmlldzoyMjU4?accessToken=u3zujipenkx5g7rtcj9qojjgxpconyjktjkli2po09nffrffdhhchq045vp0wyfo) by Jonathan Tow, Marco Bellagente, Dakota Mahan, Carlos Riquelme Ruiz, Duy Phung, Maksym Zhuravinskyi, Nathan Cooper, Nikhil Pinnaparaju, Reshinth Adithyan, and James Baicoianu. 1. **[Starcoder2](https://huggingface.co/docs/transformers/model_doc/starcoder2)** (from BigCode team) released with the paper [StarCoder 2 and The Stack v2: The Next Generation](https://arxiv.org/abs/2402.19173) by Anton Lozhkov, Raymond Li, Loubna Ben Allal, Federico Cassano, Joel Lamy-Poirier, Nouamane Tazi, Ao Tang, Dmytro Pykhtar, Jiawei Liu, Yuxiang Wei, Tianyang Liu, Max Tian, Denis Kocetkov, Arthur Zucker, Younes Belkada, Zijian Wang, Qian Liu, Dmitry Abulkhanov, Indraneil Paul, Zhuang Li, Wen-Ding Li, Megan Risdal, Jia Li, Jian Zhu, Terry Yue Zhuo, Evgenii Zheltonozhskii, Nii Osae Osae Dade, Wenhao Yu, Lucas Krauß, Naman Jain, Yixuan Su, Xuanli He, Manan Dey, Edoardo Abati, Yekun Chai, Niklas Muennighoff, Xiangru Tang, Muhtasham Oblokulov, Christopher Akiki, Marc Marone, Chenghao Mou, Mayank Mishra, Alex Gu, Binyuan Hui, Tri Dao, Armel Zebaze, Olivier Dehaene, Nicolas Patry, Canwen Xu, Julian McAuley, Han Hu, Torsten Scholak, Sebastien Paquet, Jennifer Robinson, Carolyn Jane Anderson, Nicolas Chapados, Mostofa Patwary, Nima Tajbakhsh, Yacine Jernite, Carlos Muñoz Ferrandis, Lingming Zhang, Sean Hughes, Thomas Wolf, Arjun Guha, Leandro von Werra, and Harm de Vries. 1. **[SuperPoint](https://huggingface.co/docs/transformers/model_doc/superpoint)** (from MagicLeap) released with the paper [SuperPoint: Self-Supervised Interest Point Detection and Description](https://arxiv.org/abs/1712.07629) by Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich. 1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan. 
diff --git a/docs/source/en/_toctree.yml b/docs/source/en/_toctree.yml
index 89529a375d..b820cd15ef 100644
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@@ -738,6 +738,8 @@
       title: GroupViT
     - local: model_doc/idefics
       title: IDEFICS
+    - local: model_doc/idefics2
+      title: Idefics2
     - local: model_doc/instructblip
       title: InstructBLIP
     - local: model_doc/kosmos-2
diff --git a/docs/source/en/index.md b/docs/source/en/index.md
index 701f0a7e6b..a5195f0f87 100644
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@@ -160,6 +160,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ |
| [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ |
| [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ |
+| [Idefics2](model_doc/idefics2) | ✅ | ❌ | ❌ |
| [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ |
| [Informer](model_doc/informer) | ✅ | ❌ | ❌ |
| [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ |
diff --git a/docs/source/en/model_doc/idefics2.md b/docs/source/en/model_doc/idefics2.md
new file mode 100644
index 0000000000..5b91fcf38c
--- /dev/null
+++ b/docs/source/en/model_doc/idefics2.md
@@ -0,0 +1,98 @@
+
+# Idefics2
+
+## Overview
+
+The Idefics2 model was created by the [Hugging Face M4](https://huggingface.co/HuggingFaceM4) team and authored by Léo Tronchon, Hugo Laurencon, Victor Sanh.
+The accompanying blog post can be found [here](https://huggingface.co/blog/idefics2).
+
+Idefics2 is an open multimodal model that accepts arbitrary sequences of image and text inputs and produces text
+outputs. The model can answer questions about images, describe visual content, create stories grounded on multiple
+images, or simply behave as a pure language model without visual inputs. It improves upon IDEFICS-1, notably on
+document understanding, OCR, or visual reasoning. Idefics2 is lightweight (8 billion parameters) and treats
+images in their native aspect ratio and resolution, which allows for varying inference efficiency.
+
+Tips:
+- Each sample can contain multiple images, and the number of images can vary between samples. The processor will pad the inputs to the maximum number of images in a batch for input to the model.
+- The processor has a `do_image_splitting` option. If `True`, each input image will be split into 4 sub-images and concatenated with the original to form 5 images. This is useful for increasing model performance. Make sure `processor.image_processor.do_image_splitting` is set to `False` if the model was not trained with this option (see the short sketch after this list).
+- `text` passed to the processor should have the `<image>` tokens where the images should be inserted, and `<end_of_utterance>` at the end of each utterance if the text is a chat message.
+- The processor has its own `apply_chat_template` method to convert chat messages to text that can then be passed as `text` to the processor.
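As a quick illustration of the `do_image_splitting` tip above, here is a minimal sketch (assuming the same `HuggingFaceM4/idefics2-8b` checkpoint used throughout this page):

```python
from transformers import Idefics2Processor

processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b")

# Disable splitting for checkpoints that were not trained with it; when enabled,
# each input image becomes 4 crops plus the original, i.e. 5 images per input.
processor.image_processor.do_image_splitting = False
```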
+
+Example of how to use the processor on chat messages:
+```python
+import requests
+from PIL import Image
+from transformers import Idefics2Processor, Idefics2ForConditionalGeneration
+
+url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
+url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"
+
+image_1 = Image.open(requests.get(url_1, stream=True).raw)
+image_2 = Image.open(requests.get(url_2, stream=True).raw)
+images = [image_1, image_2]
+
+messages = [{
+    "role": "user",
+    "content": [
+        {"type": "text", "text": "What’s the difference between these two images?"},
+        {"type": "image"},
+        {"type": "image"},
+    ],
+}]
+
+processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b")
+model = Idefics2ForConditionalGeneration.from_pretrained("HuggingFaceM4/idefics2-8b")
+
+text = processor.apply_chat_template(messages)
+# "User: What’s the difference between these two images?<image><image><end_of_utterance>\n"
+print(text)
+
+inputs = processor(images=images, text=text, return_tensors="pt")
+
+# `generate` returns token ids; decode them to recover the generated text
+generated_ids = model.generate(**inputs)
+generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+```
+
+This model was contributed by [amyeroberts](https://huggingface.co/amyeroberts).
+The original code can be found [here](https://huggingface.co/HuggingFaceM4/idefics2).
+
+
+## Idefics2Config
+
+[[autodoc]] Idefics2Config
+
+
+## Idefics2Model
+
+[[autodoc]] Idefics2Model
+    - forward
+
+
+## Idefics2ForConditionalGeneration
+
+[[autodoc]] Idefics2ForConditionalGeneration
+    - forward
+
+
+## Idefics2ImageProcessor
+[[autodoc]] Idefics2ImageProcessor
+    - preprocess
+
+
+## Idefics2Processor
+[[autodoc]] Idefics2Processor
+    - __call__
diff --git a/docs/source/en/perf_infer_gpu_one.md b/docs/source/en/perf_infer_gpu_one.md
index 5683f1e78b..f46bf931cc 100644
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@@ -47,6 +47,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [GPTNeo](https://huggingface.co/docs/transformers/model_doc/gpt_neo#transformers.GPTNeoModel)
* [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel)
* [GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj#transformers.GPTJModel)
+* [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
@@ -96,8 +97,8 @@ model_id = "tiiuae/falcon-7b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
-    model_id,
-    torch_dtype=torch.bfloat16,
+    model_id,
+    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)
```
@@ -109,7 +110,7 @@
FlashAttention-2 can only be used when the model's dtype is `fp16` or `bf16`. Make sure to cast your model to the appropriate dtype and load it on a supported device before using FlashAttention-2.
You can also set `use_flash_attention_2=True` to enable FlashAttention-2 but it is deprecated in favor of `attn_implementation="flash_attention_2"`. - +
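To make the migration concrete, here is a minimal sketch contrasting the two spellings (reusing the `tiiuae/falcon-7b` checkpoint from the example above):

```python
import torch
from transformers import AutoModelForCausalLM

# Recommended: select the attention backend via `attn_implementation`.
model = AutoModelForCausalLM.from_pretrained(
    "tiiuae/falcon-7b",
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
)

# Deprecated spelling; still accepted for now, but emits a deprecation warning:
# model = AutoModelForCausalLM.from_pretrained(
#     "tiiuae/falcon-7b", torch_dtype=torch.bfloat16, use_flash_attention_2=True
# )
```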
FlashAttention-2 can be combined with other optimization techniques like quantization to further speedup inference. For example, you can combine FlashAttention-2 with 8-bit or 4-bit quantization: @@ -123,14 +124,14 @@ tokenizer = AutoTokenizer.from_pretrained(model_id) # load in 8bit model = AutoModelForCausalLM.from_pretrained( - model_id, + model_id, load_in_8bit=True, attn_implementation="flash_attention_2", ) # load in 4bit model = AutoModelForCausalLM.from_pretrained( - model_id, + model_id, load_in_4bit=True, attn_implementation="flash_attention_2", ) diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 666c5f980a..0927e696a1 100644 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -507,6 +507,7 @@ _import_structure = { "IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP", "IdeficsConfig", ], + "models.idefics2": ["Idefics2Config"], "models.imagegpt": ["IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP", "ImageGPTConfig"], "models.informer": ["INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP", "InformerConfig"], "models.instructblip": [ @@ -1335,6 +1336,7 @@ else: _import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"]) _import_structure["models.grounding_dino"].extend(["GroundingDinoImageProcessor"]) _import_structure["models.idefics"].extend(["IdeficsImageProcessor"]) + _import_structure["models.idefics2"].extend(["Idefics2ImageProcessor"]) _import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"]) _import_structure["models.layoutlmv2"].extend(["LayoutLMv2FeatureExtractor", "LayoutLMv2ImageProcessor"]) _import_structure["models.layoutlmv3"].extend(["LayoutLMv3FeatureExtractor", "LayoutLMv3ImageProcessor"]) @@ -2441,6 +2443,15 @@ else: "IdeficsProcessor", ] ) + _import_structure["models.idefics2"].extend( + [ + "IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST", + "Idefics2ForConditionalGeneration", + "Idefics2Model", + "Idefics2PreTrainedModel", + "Idefics2Processor", + ] + ) _import_structure["models.imagegpt"].extend( [ "IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -5410,6 +5421,7 @@ if TYPE_CHECKING: IDEFICS_PRETRAINED_CONFIG_ARCHIVE_MAP, IdeficsConfig, ) + from .models.idefics2 import Idefics2Config from .models.imagegpt import IMAGEGPT_PRETRAINED_CONFIG_ARCHIVE_MAP, ImageGPTConfig from .models.informer import INFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, InformerConfig from .models.instructblip import ( @@ -6210,6 +6222,7 @@ if TYPE_CHECKING: from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor from .models.grounding_dino import GroundingDinoImageProcessor from .models.idefics import IdeficsImageProcessor + from .models.idefics2 import Idefics2ImageProcessor from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor from .models.layoutlmv2 import ( LayoutLMv2FeatureExtractor, @@ -7163,6 +7176,13 @@ if TYPE_CHECKING: IdeficsPreTrainedModel, IdeficsProcessor, ) + from .models.idefics2 import ( + IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST, + Idefics2ForConditionalGeneration, + Idefics2Model, + Idefics2PreTrainedModel, + Idefics2Processor, + ) from .models.imagegpt import ( IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST, ImageGPTForCausalImageModeling, diff --git a/src/transformers/image_transforms.py b/src/transformers/image_transforms.py index b3a25a8be8..016fae4405 100644 --- a/src/transformers/image_transforms.py +++ b/src/transformers/image_transforms.py @@ -749,7 +749,6 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: """ Converts an image to RGB format. 
Only converts if the image is of type PIL.Image.Image, otherwise returns the image as is. - Args: image (Image): The image to convert. @@ -759,6 +758,9 @@ def convert_to_rgb(image: ImageInput) -> ImageInput: if not isinstance(image, PIL.Image.Image): return image + if image.mode == "RGB": + return image + image = image.convert("RGB") return image diff --git a/src/transformers/image_utils.py b/src/transformers/image_utils.py index a7e53b3fe7..e4a55b3455 100644 --- a/src/transformers/image_utils.py +++ b/src/transformers/image_utils.py @@ -311,7 +311,7 @@ def load_image(image: Union[str, "PIL.Image.Image"], timeout: Optional[float] = if image.startswith("http://") or image.startswith("https://"): # We need to actually check for a real protocol, otherwise it's impossible to use a local file # like http_huggingface_co.png - image = PIL.Image.open(requests.get(image, stream=True, timeout=timeout).raw) + image = PIL.Image.open(BytesIO(requests.get(image, timeout=timeout).content)) elif os.path.isfile(image): image = PIL.Image.open(image) else: diff --git a/src/transformers/models/__init__.py b/src/transformers/models/__init__.py index f6db5e164b..01dc84400d 100644 --- a/src/transformers/models/__init__.py +++ b/src/transformers/models/__init__.py @@ -111,6 +111,7 @@ from . import ( hubert, ibert, idefics, + idefics2, imagegpt, informer, instructblip, diff --git a/src/transformers/models/auto/configuration_auto.py b/src/transformers/models/auto/configuration_auto.py index e5dd68c189..2654c45119 100755 --- a/src/transformers/models/auto/configuration_auto.py +++ b/src/transformers/models/auto/configuration_auto.py @@ -125,6 +125,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ("hubert", "HubertConfig"), ("ibert", "IBertConfig"), ("idefics", "IdeficsConfig"), + ("idefics2", "Idefics2Config"), ("imagegpt", "ImageGPTConfig"), ("informer", "InformerConfig"), ("instructblip", "InstructBlipConfig"), @@ -283,6 +284,7 @@ CONFIG_MAPPING_NAMES = OrderedDict( ] ) + MODEL_NAMES_MAPPING = OrderedDict( [ # Add full (and cased) model names here @@ -390,6 +392,7 @@ MODEL_NAMES_MAPPING = OrderedDict( ("hubert", "Hubert"), ("ibert", "I-BERT"), ("idefics", "IDEFICS"), + ("idefics2", "Idefics2"), ("imagegpt", "ImageGPT"), ("informer", "Informer"), ("instructblip", "InstructBLIP"), diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py index 6ae28bfa32..c8538a9a55 100644 --- a/src/transformers/models/auto/image_processing_auto.py +++ b/src/transformers/models/auto/image_processing_auto.py @@ -71,6 +71,7 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( ("grounding-dino", "GroundingDinoImageProcessor"), ("groupvit", "CLIPImageProcessor"), ("idefics", "IdeficsImageProcessor"), + ("idefics2", "Idefics2ImageProcessor"), ("imagegpt", "ImageGPTImageProcessor"), ("instructblip", "BlipImageProcessor"), ("kosmos-2", "CLIPImageProcessor"), diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index cf45f4a8d3..8341251f4f 100755 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -120,6 +120,7 @@ MODEL_MAPPING_NAMES = OrderedDict( ("hubert", "HubertModel"), ("ibert", "IBertModel"), ("idefics", "IdeficsModel"), + ("idefics2", "Idefics2Model"), ("imagegpt", "ImageGPTModel"), ("informer", "InformerModel"), ("jukebox", "JukeboxModel"), @@ -287,6 +288,7 @@ MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), 
("ibert", "IBertForMaskedLM"), ("idefics", "IdeficsForVisionText2Text"), + ("idefics2", "Idefics2ForConditionalGeneration"), ("layoutlm", "LayoutLMForMaskedLM"), ("llava", "LlavaForConditionalGeneration"), ("llava_next", "LlavaNextForConditionalGeneration"), @@ -678,6 +680,7 @@ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( ("blip", "BlipForConditionalGeneration"), ("blip-2", "Blip2ForConditionalGeneration"), ("git", "GitForCausalLM"), + ("idefics2", "Idefics2ForConditionalGeneration"), ("instructblip", "InstructBlipForConditionalGeneration"), ("kosmos-2", "Kosmos2ForConditionalGeneration"), ("llava", "LlavaForConditionalGeneration"), diff --git a/src/transformers/models/auto/processing_auto.py b/src/transformers/models/auto/processing_auto.py index 5a654c3d02..a7134f26a7 100644 --- a/src/transformers/models/auto/processing_auto.py +++ b/src/transformers/models/auto/processing_auto.py @@ -61,6 +61,7 @@ PROCESSOR_MAPPING_NAMES = OrderedDict( ("groupvit", "CLIPProcessor"), ("hubert", "Wav2Vec2Processor"), ("idefics", "IdeficsProcessor"), + ("idefics2", "Idefics2Processor"), ("instructblip", "InstructBlipProcessor"), ("kosmos-2", "Kosmos2Processor"), ("layoutlmv2", "LayoutLMv2Processor"), diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index d23ce040a9..114521e91a 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -201,6 +201,7 @@ else: ("hubert", ("Wav2Vec2CTCTokenizer", None)), ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), ("jukebox", ("JukeboxTokenizer", None)), ( diff --git a/src/transformers/models/idefics/modeling_idefics.py b/src/transformers/models/idefics/modeling_idefics.py index 47024d24e6..a01c2279c1 100644 --- a/src/transformers/models/idefics/modeling_idefics.py +++ b/src/transformers/models/idefics/modeling_idefics.py @@ -1458,18 +1458,27 @@ class IdeficsForVisionText2Text(IdeficsPreTrainedModel): Example: ```python - >>> from transformers import AutoTokenizer, IdeficsForVisionText2Text + >>> from transformers import AutoProcessor, IdeficsForVisionText2Text >>> model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b") - >>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceM4/idefics-9b") + >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b") - >>> prompt = "Hey, are you consciours? Can you talk to me?" - >>> inputs = tokenizer(prompt, return_tensors="pt") + >>> dogs_image_url_1 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image1.jpeg" + >>> dogs_image_url_2 = "https://huggingface.co/datasets/hf-internal-testing/fixtures_nlvr2/raw/main/image2.jpeg" - >>> # Generate - >>> generate_ids = model.generate(inputs.input_ids, max_length=30) - >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] - "Hey, are you consciours? Can you talk to me?\nI'm not consciours, but I can talk to you." + >>> prompts = [ + ... [ + ... "User:", + ... dogs_image_url_1, + ... "Describe this image.\nAssistant: An image of two dogs.\n", + ... "User:", + ... dogs_image_url_2, + ... 
"Describe this image.\nAssistant:", + ... ] + ... ] + >>> inputs = processor(prompts, return_tensors="pt") + >>> generate_ids = model.generate(**inputs, max_new_tokens=6) + >>> processor.batch_decode(generate_ids, skip_special_tokens=True) ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions diff --git a/src/transformers/models/idefics2/__init__.py b/src/transformers/models/idefics2/__init__.py new file mode 100644 index 0000000000..3b1996ef95 --- /dev/null +++ b/src/transformers/models/idefics2/__init__.py @@ -0,0 +1,74 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available + + +_import_structure = {"configuration_idefics2": ["Idefics2Config"]} + + +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["image_processing_idefics2"] = ["Idefics2ImageProcessor"] + + +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + pass +else: + _import_structure["modeling_idefics2"] = [ + "IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST", + "Idefics2ForConditionalGeneration", + "Idefics2PreTrainedModel", + "Idefics2Model", + ] + _import_structure["processing_idefics2"] = ["Idefics2Processor"] + +if TYPE_CHECKING: + from .configuration_idefics2 import Idefics2Config + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .image_processing_idefics2 import Idefics2ImageProcessor + + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + pass + else: + from .modeling_idefics2 import ( + IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST, + Idefics2ForConditionalGeneration, + Idefics2Model, + Idefics2PreTrainedModel, + ) + from .processing_idefics2 import Idefics2Processor + + +else: + import sys + + sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) diff --git a/src/transformers/models/idefics2/configuration_idefics2.py b/src/transformers/models/idefics2/configuration_idefics2.py new file mode 100644 index 0000000000..1856bdbccb --- /dev/null +++ b/src/transformers/models/idefics2/configuration_idefics2.py @@ -0,0 +1,262 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Idefics2 model configuration"""
+
+import os
+from typing import Union
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class Idefics2VisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`Idefics2VisionModel`]. It is used to instantiate an
+    Idefics2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the SigLIP checkpoint
+    [google/siglip-base-patch16-224](https://huggingface.co/google/siglip-base-patch16-224) used in the Idefics2 model
+    [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            Number of channels in the input images.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 32):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation for initializing all weight matrices in the model.
+ + Example: + + ```python + >>> from transformers.models.idefics2.modeling_idefics2 import Idefics2VisionTransformer + >>> from transformers.models.idefics2.configuration_idefics2 import Idefics2VisionConfig + + >>> # Initializing a Idefics2VisionConfig with google/siglip-base-patch16-224 style configuration + >>> configuration = Idefics2VisionConfig() + + >>> # Initializing a Idefics2VisionTransformer (with random weights) from the google/siglip-base-patch16-224 style configuration + >>> model = Idefics2VisionTransformer(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "idefics2" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from Idefics2Config + if config_dict.get("model_type") == "idefics2": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." + ) + + return cls.from_dict(config_dict, **kwargs) + + +class Idefics2PerceiverConfig(PretrainedConfig): + r""" + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the perceiver block. + resampler_n_latents (`int`, *optional*, defaults to 64): + Number of latent embeddings to resample ("compress") the input sequence to (usually < 128). + resampler_depth (`int`, *optional*, defaults to 3): + Depth of the Perceiver Resampler (Transformer w/ cross attention). Should be shallow (<= 3). + resampler_n_heads (`int`, *optional*, defaults to 16): + Number of heads in each Transformer block (for multi-headed self-attention). + resampler_head_dim (`int`, *optional*, defaults to 96): + Dimensionality of each head projection in the Transformer block. + num_key_value_heads (`int`, *optional*, defaults to 4): + Number of key-value heads in the perceiver attention block. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. 
+ """ + + model_type = "idefics2" + + def __init__( + self, + hidden_act="silu", + resampler_n_latents=64, + resampler_depth=3, + resampler_n_heads=16, + resampler_head_dim=96, + num_key_value_heads=4, + attention_dropout=0.0, + **kwargs, + ): + self.hidden_act = hidden_act + self.resampler_n_latents = resampler_n_latents + self.resampler_depth = resampler_depth + self.resampler_n_heads = resampler_n_heads + self.num_key_value_heads = num_key_value_heads + self.resampler_head_dim = resampler_head_dim + self.attention_dropout = attention_dropout + if self.num_key_value_heads > self.resampler_n_heads: + raise ValueError( + f"num_key_value_heads={self.num_key_value_heads} must be less than or equal to" + f" resampler_n_heads={self.resampler_n_heads}" + ) + super().__init__(**kwargs) + + +class Idefics2Config(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Idefics2Model`]. It is used to instantiate a + Idefics2 model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the model of the Idefics2 + [HuggingFaceM4/idefics2-8b](https://huggingface.co/HuggingFaceM4/idefics2-8b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should cache the key/value pairs of the attention mechanism. + image_token_id (`int`, *optional*, defaults to 32001): + The id of the "image" token. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to tie the word embeddings with the token embeddings. 
+        vision_config (`Idefics2VisionConfig` or `dict`, *optional*):
+            Custom vision config or dict.
+        perceiver_config (`Idefics2PerceiverConfig` or `dict`, *optional*):
+            Custom perceiver config or dict.
+        text_config (`MistralConfig` or `dict`, *optional*):
+            Custom text config or dict for the text model.
+
+    Example:
+    ```python
+    >>> from transformers import Idefics2Model, Idefics2Config
+    >>> # Initializing configuration
+    >>> configuration = Idefics2Config()
+    >>> # Initializing a model from the configuration
+    >>> model = Idefics2Model(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "idefics2"
+    is_composition = True
+
+    def __init__(
+        self,
+        use_cache=True,
+        image_token_id=32_001,
+        tie_word_embeddings=False,
+        vision_config=None,
+        perceiver_config=None,
+        text_config=None,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.use_cache = use_cache
+        self.tie_word_embeddings = tie_word_embeddings
+
+        if perceiver_config is None:
+            self.perceiver_config = Idefics2PerceiverConfig()
+            logger.info("perceiver_config is None, using default perceiver config")
+        elif isinstance(perceiver_config, dict):
+            self.perceiver_config = Idefics2PerceiverConfig(**perceiver_config)
+        elif isinstance(perceiver_config, Idefics2PerceiverConfig):
+            self.perceiver_config = perceiver_config
+
+        if vision_config is None:
+            self.vision_config = Idefics2VisionConfig()
+            logger.info("vision_config is None, using default vision config")
+        elif isinstance(vision_config, dict):
+            self.vision_config = Idefics2VisionConfig(**vision_config)
+        elif isinstance(vision_config, Idefics2VisionConfig):
+            self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "mistral"
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            logger.info("text_config is None, using default text config")
+            text_config = CONFIG_MAPPING["mistral"](
+                max_position_embeddings=4096 * 8,
+                rms_norm_eps=1e-5,
+                # None in the original configuration_mistral, we set it to the unk_token_id
+                pad_token_id=0,
+                tie_word_embeddings=False,
+            )
+
+        self.text_config = text_config
+
+        super().__init__(**kwargs, tie_word_embeddings=tie_word_embeddings)
diff --git a/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py b/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py
new file mode 100644
index 0000000000..ea44ee11e5
--- /dev/null
+++ b/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py
@@ -0,0 +1,185 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +import argparse +import copy + +import torch +from accelerate import init_empty_weights + +from transformers import ( + AutoConfig, + AutoModelForCausalLM, + AutoTokenizer, + Idefics2Config, + Idefics2ForConditionalGeneration, + Idefics2ImageProcessor, + Idefics2Processor, + MistralConfig, +) + + +EPILOG_TXT = """Example: + python transformers/src/transformers/models/idefics2/convert_idefics2_weights_to_hf.py --original_model_id HuggingFaceM4/idefics2-8b --output_hub_path org/idefics2 +""" + + +KEYS_TO_MODIFY_MAPPING = { + "lm_head.weight": "lm_head.linear.weight", + "model.layers": "model.text_model.layers", + "model.norm": "model.text_model.norm", + "model.perceiver_resampler": "model.connector.perceiver_resampler", + "model.modality_projection": "model.connector.modality_projection", +} + + +WEIGHTS_TO_MERGE_MAPPING = ( + # (weights to merge in merging order), (new weight name) + ( + ("model.embed_tokens.weight", "model.embed_tokens.additional_embedding.weight"), + "model.text_model.embed_tokens.weight", + ), + (("lm_head.linear.weight", "additional_fc.weight"), "lm_head.weight"), +) + + +def convert_state_dict_to_hf(state_dict): + new_state_dict = {} + for key, value in state_dict.items(): + if key.endswith(".inv_freq"): + continue + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + new_state_dict[key] = value + return new_state_dict + + +def merge_weights(state_dict): + new_state_dict = copy.deepcopy(state_dict) + + # Merge the weights + for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: + for weight in weights_to_merge: + assert weight in state_dict, f"Weight {weight} is missing in the state dict" + if new_weight_name not in new_state_dict: + new_state_dict[new_weight_name] = [state_dict[weight]] + else: + new_state_dict[new_weight_name].append(state_dict[weight]) + new_state_dict[new_weight_name] = torch.cat(new_state_dict[new_weight_name], dim=0) + + # Remove the weights that were merged + for weights_to_merge, new_weight_name in WEIGHTS_TO_MERGE_MAPPING: + for weight in weights_to_merge: + if weight in new_state_dict and weight != new_weight_name: + new_state_dict.pop(weight) + + return new_state_dict + + +def get_config(checkpoint): + if checkpoint == "HuggingFaceM4/idefics2": + # We load the config then recreate to use the text_config + config = AutoConfig.from_pretrained(checkpoint) + text_config = MistralConfig( + vocab_size=config.vocab_size + config.additional_vocab_size, + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + num_hidden_layers=config.num_hidden_layers, + num_attention_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + hidden_act=config.hidden_act, + max_position_embeddings=config.max_position_embeddings, + initializer_range=config.initializer_range, + rms_norm_eps=config.rms_norm_eps, + tie_word_embeddings=config.tie_word_embeddings, + rope_theta=config.rope_theta, + sliding_window=config.sliding_window, + attention_dropout=config.attention_dropout, + pad_token_id=config.pad_token_id, + bos_token_id=config.bos_token_id, + eos_token_id=config.eos_token_id, + ) + perceiver_config = config.perceiver_config.to_dict() + config = Idefics2Config( + text_config=text_config.to_dict(), + vision_config=config.vision_config, + perceiver_config=perceiver_config, + use_cache=config.use_cache, + image_token_id=config.image_token_id, + tie_word_embeddings=config.tie_word_embeddings, + ) + return config + + 
return AutoConfig.from_pretrained(checkpoint) + + +def convert_idefics2_hub_to_hf(original_model_id, output_hub_path, push_to_hub): + # The original model maps to AutoModelForCausalLM, converted we map to Idefics2ForConditionalGeneration + original_model = AutoModelForCausalLM.from_pretrained(original_model_id, trust_remote_code=True) + # The original model doesn't use the idefics2 processing objects + image_seq_len = original_model.config.perceiver_config.resampler_n_latents + image_processor = Idefics2ImageProcessor() + tokenizer = AutoTokenizer.from_pretrained(original_model_id) + processor = Idefics2Processor( + image_processor=image_processor, + tokenizer=tokenizer, + image_seq_len=image_seq_len, + ) + state_dict = original_model.state_dict() + state_dict = convert_state_dict_to_hf(state_dict) + + # Merge weights + state_dict = merge_weights(state_dict) + + config = get_config(original_model_id) + + with init_empty_weights(): + model = Idefics2ForConditionalGeneration(config) + + model.load_state_dict(state_dict, strict=True, assign=True) + + model.save_pretrained(output_hub_path) + processor.save_pretrained(output_hub_path) + + if push_to_hub: + model.push_to_hub(output_hub_path, private=True) + processor.push_to_hub(output_hub_path, private=True) + + +def main(): + parser = argparse.ArgumentParser( + epilog=EPILOG_TXT, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--original_model_id", + help="Hub location of the text model", + ) + parser.add_argument( + "--output_hub_path", + help="Location on the hub of the converted model", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="If set, the model will be pushed to the hub after conversion.", + ) + args = parser.parse_args() + convert_idefics2_hub_to_hf(args.original_model_id, args.output_hub_path, args.push_to_hub) + + +if __name__ == "__main__": + main() diff --git a/src/transformers/models/idefics2/image_processing_idefics2.py b/src/transformers/models/idefics2/image_processing_idefics2.py new file mode 100644 index 0000000000..ac9df68871 --- /dev/null +++ b/src/transformers/models/idefics2/image_processing_idefics2.py @@ -0,0 +1,596 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature +from ...image_transforms import PaddingMode, pad, resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + is_valid_image, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + from PIL import Image + + +def get_resize_output_image_size(image, size, input_data_format) -> Tuple[int, int]: + """ + Get the output size of the image after resizing given a dictionary specifying the max and min sizes. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image containing the keys "shortest_edge" and "longest_edge". + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + The output size of the image after resizing. + """ + height, width = get_image_size(image, channel_dim=input_data_format) + + min_len = size["shortest_edge"] + max_len = size["longest_edge"] + aspect_ratio = width / height + + if width >= height and width > max_len: + width = max_len + height = int(width / aspect_ratio) + elif height > width and height > max_len: + height = max_len + width = int(height * aspect_ratio) + height = max(height, min_len) + width = max(width, min_len) + return height, width + + +def make_list_of_images(images: ImageInput) -> List[List[np.ndarray]]: + """ + Convert a single image or a list of images to a list of numpy arrays. + + Args: + images (`ImageInput`): + A single image or a list of images. + + Returns: + A list of numpy arrays. + """ + # If it's a single image, convert it to a list of lists + if is_valid_image(images): + images = [[images]] + # If it's a list of images, it's a single batch, so convert it to a list of lists + elif isinstance(images, (list, tuple)) and len(images) > 0 and is_valid_image(images[0]): + images = [images] + # If it's a list of batches, it's already in the right format + elif ( + isinstance(images, (list, tuple)) + and len(images) > 0 + and isinstance(images[0], (list, tuple)) + and is_valid_image(images[0][0]) + ): + pass + else: + raise ValueError( + "Invalid input type. Must be a single image, a list of images, or a list of batches of images." + ) + return images + + +# Copied from transformers.models.detr.image_processing_detr.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +def get_max_height_width( + images_list: List[List[np.ndarray]], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. 
+ """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images_list[0][0]) + + image_sizes = [] + for images in images_list: + for image in images: + image_sizes.append(get_image_size(image, channel_dim=input_data_format)) + + max_height, max_width = max_across_indices(image_sizes) + return (max_height, max_width) + + +# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# FIXME Amy: merge this function with the one in image_transforms.py +def convert_to_rgb(image: ImageInput) -> ImageInput: + """ + Converts an image to RGB format. Only converts if the image is of type PIL.Image.Image, otherwise returns the image + as is. + Args: + image (Image): + The image to convert. + """ + if not isinstance(image, PIL.Image.Image): + return image + + # `image.convert("RGB")` would only work for .jpg images, as it creates a wrong background + # for transparent images. The call to `alpha_composite` handles this case + if image.mode == "RGB": + return image + + image_rgba = image.convert("RGBA") + background = Image.new("RGBA", image_rgba.size, (255, 255, 255)) + alpha_composite = Image.alpha_composite(background, image_rgba) + alpha_composite = alpha_composite.convert("RGB") + return alpha_composite + + +class Idefics2ImageProcessor(BaseImageProcessor): + r""" + Constructs a Idefics image processor. + + Args: + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. This is useful if the input image is of a different format e.g. RGBA. + Only has an effect if the input image is in the PIL format. + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image. The longest edge of the image is resized to be <= `size["longest_edge"]`, with the + shortest edge resized to keep the input aspect ratio, with a minimum size of `size["shortest_edge"]`. + size (`Dict`, *optional*): + Controls the size of the output image. This is a dictionary containing the keys "shortest_edge" and "longest_edge". + resample (`Resampling`, *optional*, defaults to `Resampling.BILINEAR`): + Resampling filter to use when resizing the image. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image. If set to `True`, the image is rescaled to have pixel values between 0 and 1. + rescale_factor (`float`, *optional*, defaults to `1/255`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. If set to `True`, the image is normalized to have a mean of `image_mean` and + a standard deviation of `image_std`. + image_mean (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. 
Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IDEFICS_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Whether or not to pad the images to the largest height and width in the batch and number of images per + sample in the batch, such that the returned tensor is of shape (batch_size, max_num_images, num_channels, max_height, max_width). + do_image_splitting (`bool`, *optional*, defaults to `False`): + Whether to split the image into a sequence 4 equal sub-images concatenated with the original image. That + strategy was first introduced in https://arxiv.org/abs/2311.06607. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_convert_rgb: bool = True, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BILINEAR, + do_rescale: bool = True, + rescale_factor: float = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: bool = True, + do_image_splitting: bool = False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + self.do_convert_rgb = do_convert_rgb + self.do_resize = do_resize + self.size = size if size is not None else {"shortest_edge": 378, "longest_edge": 980} + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_pad = do_pad + self.do_image_splitting = do_image_splitting + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BILINEAR, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + if "shortest_edge" in size and "longest_edge" in size: + size = get_resize_output_image_size(image, size, input_data_format) + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError( + "size must be a dictionary with keys 'shortest_edge' and 'longest_edge' or 'height' and 'width'." 
+            )
+        return resize(
+            image, size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
+
+    # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: Tuple[int, int],
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pad an image with zeros to the given size.
+        """
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+        output_height, output_width = output_size
+
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        padding = ((0, pad_bottom), (0, pad_right))
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        return padded_image
+
+    def pad(
+        self,
+        images: List[np.ndarray],
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> BatchFeature:
+        """
+        For a list of lists of images, pads each image to the bottom and right with zeros up to the size of the
+        largest height and width in the batch. For each sample in the batch, pads the sample with empty images up to
+        the maximum number of images per sample in the batch. Optionally returns a pixel mask.
+
+        Args:
+            images (`np.ndarray`):
+                List of list of images to pad. Pads to the largest height and width in the batch.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+ """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + batch_size = len(images) + max_num_images = max(len(images_) for images_ in images) + input_data_format = ( + infer_channel_dimension_format(images[0][0]) if input_data_format is None else input_data_format + ) + data_format = input_data_format if data_format is None else data_format + + def empty_image(size, input_data_format): + if input_data_format == ChannelDimension.FIRST: + return np.zeros((3, *size), dtype=np.uint8) + elif input_data_format == ChannelDimension.LAST: + return np.zeros((*size, 3), dtype=np.uint8) + raise ValueError("Invalid channel dimension format.") + + padded_images_list = [ + [empty_image(pad_size, data_format) for _ in range(max_num_images)] for _ in range(batch_size) + ] + padded_masks = [[np.zeros(pad_size) for _ in range(max_num_images)] for _ in range(batch_size)] + + for batch_idx in range(batch_size): + for sample_idx, image in enumerate(images[batch_idx]): + padded_images_list[batch_idx][sample_idx] = self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + padded_masks[batch_idx][sample_idx] = make_pixel_mask( + image, output_size=pad_size, input_data_format=input_data_format + ) + + padded_masks = padded_masks if return_pixel_mask else None + return padded_images_list, padded_masks + + def _crop( + self, + im: np.ndarray, + w1: int, + h1: int, + w2: int, + h2: int, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + if input_data_format == ChannelDimension.FIRST: + return im[:, h1:h2, w1:w2] + elif input_data_format == ChannelDimension.LAST: + return im[h1:h2, w1:w2, :] + + def split_image( + self, + image: np.ndarray, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Split an image into 4 equal sub-images, and the concatenate that sequence with the original image. + That means that a single image becomes a sequence of 5 images. + This is a "trick" to spend more compute on each image with no changes in the vision encoder. + + Args: + image (`np.ndarray`): + Images to split. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + height, width = get_image_size(image, input_data_format) + + mid_width = width // 2 + mid_height = height // 2 + return [ + self._crop(image, 0, 0, mid_width, mid_height, input_data_format), + self._crop(image, mid_width, 0, width, mid_height, input_data_format), + self._crop(image, 0, mid_height, mid_width, height, input_data_format), + self._crop(image, mid_width, mid_height, width, height, input_data_format), + image, + ] + + def preprocess( + self, + images: ImageInput, + do_convert_rgb: Optional[bool] = None, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + do_image_splitting: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + input_data_format: Optional[ChannelDimension] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + ): + """ + Preprocess a batch of images. 
+
+        Args:
+            images (`ImageInput`):
+                A list of images to preprocess.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`Dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+                the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether or not to pad the images to the largest height and width in the batch.
+            do_image_splitting (`bool`, *optional*, defaults to `self.do_image_splitting`):
+                Whether to split the image into a sequence of 4 equal sub-images concatenated with the original image.
+                That strategy was first introduced in https://arxiv.org/abs/2311.06607.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                    - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                    - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
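+
+        Example (a minimal sketch rather than a tested doctest; the dummy image and the default
+        settings are illustrative assumptions):
+
+        ```python
+        >>> import numpy as np
+        >>> from PIL import Image
+        >>> from transformers import Idefics2ImageProcessor
+
+        >>> image_processor = Idefics2ImageProcessor()
+        >>> image = Image.fromarray(np.full((480, 640, 3), 128, dtype=np.uint8))
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> list(inputs.keys())
+        ['pixel_values', 'pixel_attention_mask']
+        ```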
+ """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_pad = do_pad if do_pad is not None else self.do_pad + do_image_splitting = do_image_splitting if do_image_splitting is not None else self.do_image_splitting + + images_list = make_list_of_images(images) + + if not valid_images(images_list[0]): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images_list = [[convert_to_rgb(image) for image in images] for images in images_list] + + # All transformations expect numpy arrays. + images_list = [[to_numpy_array(image) for image in images] for images in images_list] + + if is_scaled_image(images_list[0][0]) and do_rescale: + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. 
+ input_data_format = infer_channel_dimension_format(images_list[0][0]) + + if do_image_splitting: + new_images_list = [] + for images in images_list: + new_images = [] + for image in images: + new_images.extend(self.split_image(image, input_data_format)) + new_images_list.append(new_images) + images_list = new_images_list + + if do_resize: + images_list = [ + [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + for images in images_list + ] + + if do_rescale: + images_list = [ + [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + for images in images_list + ] + + if do_normalize: + images_list = [ + [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + for images in images_list + ] + + pixel_attention_mask = None + if do_pad: + images_list, pixel_attention_mask = self.pad( + images_list, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=input_data_format + ) + + if data_format is not None: + images_list = [ + [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in images + ] + for images in images_list + ] + + data = {"pixel_values": np.array(images_list) if do_pad else images_list} # Faster tensor conversion + if pixel_attention_mask is not None: + data["pixel_attention_mask"] = np.array(pixel_attention_mask) if do_pad else pixel_attention_mask + + return BatchFeature(data=data, tensor_type=return_tensors) diff --git a/src/transformers/models/idefics2/modeling_idefics2.py b/src/transformers/models/idefics2/modeling_idefics2.py new file mode 100644 index 0000000000..32d707ae66 --- /dev/null +++ b/src/transformers/models/idefics2/modeling_idefics2.py @@ -0,0 +1,1959 @@ +# coding=utf-8 +# Copyright 2024 the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Idefics2 model.""" + +import inspect +import math +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +import torch +import torch.nn.functional as F +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ... 
import PreTrainedModel +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import BaseModelOutput, ModelOutput +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_flash_attn_2_available, + is_flash_attn_greater_or_equal_2_10, + logging, + replace_return_docstrings, +) +from ..auto import AutoModel +from .configuration_idefics2 import Idefics2Config, Idefics2VisionConfig + + +if is_flash_attn_2_available(): + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "Idefics2Config" + +IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST = [ + "HuggingFaceM4/idefics2-8b", + # See all IDEFICS2 models at https://huggingface.co/models?filter=idefics2 +] + + +@dataclass +class Idefics2BaseModelOutputWithPast(ModelOutput): + """ + Base class for Idefics2 model's outputs that may also contain a past key/values (to speed up sequential decoding). + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`tuple(torch.FloatTensor)`, *optional*): + Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images, + sequence_length, hidden_size)`. 
+            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    last_hidden_state: torch.FloatTensor = None
+    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+# Copied from transformers.models.idefics.modeling_idefics.IdeficsCausalLMOutputWithPast with Idefics->Idefics2
+class Idefics2CausalLMOutputWithPast(ModelOutput):
+    """
+    Base class for Idefics2 causal language model (or autoregressive) outputs.
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss (for next-token prediction).
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)
+            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+            `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+            Tuple of `torch.FloatTensor` (one for the output of the image embeddings), of shape `(batch_size,
+            num_images, sequence_length, hidden_size)`.
+            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: torch.FloatTensor = None
+    past_key_values: Optional[List[torch.FloatTensor]] = None
+    hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+    attentions: Optional[Tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[Tuple[torch.FloatTensor]] = None
+
+
+class Idefics2VisionEmbeddings(nn.Module):
+    """
+    This is a modified version of `siglip.modeling_siglip.SiglipVisionEmbeddings` to enable images of variable
+    resolution.
+
+    The modifications are adapted from [Patch n' Pack: NaViT, a Vision Transformer for any Aspect Ratio and Resolution](https://arxiv.org/abs/2307.06304)
+    which allows treating images in their native aspect ratio without the need to resize them to the same
+    fixed size.
In particular, we start from the original pre-trained SigLIP model
+    (which uses fixed-size square images) and adapt it by training on images of variable resolutions.
+    """
+
+    def __init__(self, config: Idefics2VisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            padding="valid",
+        )
+
+        self.num_patches_per_side = self.image_size // self.patch_size
+        self.num_patches = self.num_patches_per_side**2
+        self.num_positions = self.num_patches
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+
+    def forward(self, pixel_values: torch.FloatTensor, patch_attention_mask: torch.BoolTensor) -> torch.Tensor:
+        batch_size, _, max_im_h, max_im_w = pixel_values.shape
+
+        patch_embeds = self.patch_embedding(pixel_values)
+        embeddings = patch_embeds.flatten(2).transpose(1, 2)
+
+        max_nb_patches_h, max_nb_patches_w = max_im_h // self.patch_size, max_im_w // self.patch_size
+        boundaries = torch.arange(1 / self.num_patches_per_side, 1.0, 1 / self.num_patches_per_side)
+        position_ids = torch.full(size=(batch_size, max_nb_patches_h * max_nb_patches_w), fill_value=0)
+
+        for batch_idx, p_attn_mask in enumerate(patch_attention_mask):
+            nb_patches_h = p_attn_mask[:, 0].sum()
+            nb_patches_w = p_attn_mask[0].sum()
+
+            fractional_coords_h = torch.arange(0, 1 - 1e-6, 1 / nb_patches_h)
+            fractional_coords_w = torch.arange(0, 1 - 1e-6, 1 / nb_patches_w)
+
+            bucket_coords_h = torch.bucketize(fractional_coords_h, boundaries, right=True)
+            bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
+
+            pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
+            position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
+
+        position_ids = position_ids.to(self.position_embedding.weight.device)
+        embeddings = embeddings + self.position_embedding(position_ids)
+        return embeddings
+
+
+# Copied from transformers.models.siglip.modeling_siglip.SiglipAttention with Siglip->Idefics2Vision
+class Idefics2VisionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPAttention.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+        # Ignore copy
+        self.is_causal = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        k_v_seq_len = key_states.shape[-2]
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale
+
+        if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights + attention_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+        attn_output = torch.matmul(attn_weights, value_states)
+
+        if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+class Idefics2VisionFlashAttention2(Idefics2VisionAttention):
+    """
+    Idefics2Vision flash attention module. This module inherits from `Idefics2VisionAttention` as the weights of the
+    module stay untouched. The only required change would be on the forward pass where it needs to correctly call the
+    public API of flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
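+        # For intuition, an illustrative (not executed) comparison with q_seqlen=2 and k_seqlen=3:
+        #   bottom-right aligned causal mask      top-left aligned causal mask
+        #     [[1, 1, 0],                           [[1, 0, 0],
+        #      [1, 1, 1]]                            [1, 1, 0]]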
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        output_attentions = False
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x num_heads x head_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
+        # therefore the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back to the correct dtype just to be sure everything works as expected.
+        # This might slow down training & inference, so it is recommended not to cast the LayerNorms
+        # to fp32. (Idefics2RMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seem to be silently cast to float32, this might be related to"
+                f" the fact that you have upcast embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        attn_output = self._flash_attention_forward(
+            query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._flash_attention_forward
+    def _flash_attention_forward(
+        self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
+        first unpads the input, then computes the attention scores and pads the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal + ) + + return attn_output + + # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2._upad_input + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis( + key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
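+            # e.g. (illustrative): with a left-padded mask [[0, 0, 1, 1], [0, 1, 1, 1]] and
+            # query_length=2, the slice below keeps the last two key positions of each row.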
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +IDEFICS_VISION_ATTENTION_CLASSES = { + "eager": Idefics2VisionAttention, + "flash_attention_2": Idefics2VisionFlashAttention2, +} + + +# Copied from transformers.models.siglip.modeling_siglip.SiglipMLP with Siglip->Idefics2Vision +class Idefics2VisionMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class Idefics2MLP(nn.Module): + def __init__( + self, + hidden_size: int, + intermediate_size: int, + output_size: int, + hidden_act: str, + ): + super().__init__() + self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False) + self.down_proj = nn.Linear(intermediate_size, output_size, bias=False) + self.act_fn = ACT2FN[hidden_act] + + def forward(self, x): + return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + + +# Copied from transformers.models.siglip.modeling_siglip.SiglipMultiheadAttentionPoolingHead with Siglip->Idefics2 +class Idefics2MultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, config: Idefics2VisionConfig): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + # Ignore copy + self.mlp = Idefics2MLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + output_size=config.hidden_size, + ) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + hidden_state = self.attention(probe, hidden_state, hidden_state)[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +class Idefics2EncoderLayer(nn.Module): + def __init__(self, config: Idefics2Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = IDEFICS_VISION_ATTENTION_CLASSES[config._attn_implementation](config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = Idefics2VisionMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + # Copied from transformers.models.siglip.modeling_siglip.SiglipEncoderLayer.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. 
+ attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.siglip.modeling_siglip.SiglipEncoder with Siglip->Idefics2 +class Idefics2Encoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Idefics2EncoderLayer`]. + + Args: + config: Idefics2Config + """ + + def __init__(self, config: Idefics2Config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([Idefics2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + # Ignore copy + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
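+
+        Example (a schematic sketch, not a tested doctest; `encoder` and `config` stand for an
+        `Idefics2Encoder` instance and its vision config, which are assumptions here):
+
+        ```python
+        >>> import torch
+        >>> inputs_embeds = torch.randn(1, 16, config.hidden_size)
+        >>> outputs = encoder(inputs_embeds=inputs_embeds, output_hidden_states=True)
+        >>> len(outputs.hidden_states)  # embeddings output + one entry per layer
+        ```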
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class Idefics2VisionTransformer(nn.Module): + def __init__(self, config: Idefics2VisionConfig): + super().__init__() + embed_dim = config.hidden_size + + self.config = config + self.embeddings = Idefics2VisionEmbeddings(config) + self.encoder = Idefics2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + def get_input_embeddings(self): + return self.embeddings + + def set_input_embeddings(self, value): + self.embeddings = value + + def forward( + self, + pixel_values, + patch_attention_mask: Optional[torch.BoolTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size = pixel_values.size(0) + if patch_attention_mask is None: + patch_size = self.config.patch_size + patch_attention_mask = torch.ones( + ( + batch_size, + pixel_values.size(2) // patch_size, + pixel_values.size(3) // patch_size, + ) + ) + patch_attention_mask = patch_attention_mask.to(dtype=torch.bool, device=pixel_values.device) + + hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask) + + patch_attention_mask = patch_attention_mask.view(batch_size, -1) + # The call to `_upad_input` in `_flash_attention_forward` is expensive + # So when the `patch_attention_mask` is full of 1s (i.e. 
attending to the whole sequence),
+        # we avoid passing it, which is equivalent to attending to the full sequence
+        if not torch.any(~patch_attention_mask):
+            patch_attention_mask = None
+        elif not self._use_flash_attention_2:
+            patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            attention_mask=patch_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        if not return_dict:
+            return (last_hidden_state,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.llama.modeling_llama._get_unpad_data
+def _get_unpad_data(attention_mask):
+    seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
+    indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
+    max_seqlen_in_batch = seqlens_in_batch.max().item()
+    cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
+    return (
+        indices,
+        cu_seqlens,
+        max_seqlen_in_batch,
+    )
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Idefics2
+class Idefics2RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        Idefics2RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+
+class Idefics2PerceiverAttention(nn.Module):
+    def __init__(self, config, layer_idx: Optional[int] = None) -> None:
+        """Perceiver Cross-Attention Module --> let long-form inputs be `context`, resampled embeddings be `latents`"""
+        super().__init__()
+
+        self.layer_idx = None
+        self.hidden_size = config.text_config.hidden_size
+        self.num_heads = config.perceiver_config.resampler_n_heads
+        self.head_dim = config.perceiver_config.resampler_head_dim
+        self.num_key_value_heads = config.perceiver_config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.attention_dropout = config.perceiver_config.attention_dropout
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj =
nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False) + self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False) + + self.is_causal = False + + def forward( + self, + latents: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """ + Runs Perceiver Self-Attention, with special (context, latents) appended along the `seq` dimension! + + Args: + latents (`torch.Tensor`): Tensor of shape [bsz, n_latents, embed_dim] representing fixed length latents to compress to. + context (`torch.Tensor`): Tensor of shape [bsz, seq, embed_dim] representing long-form context to resample. + attention_mask (`torch.Tensor`, *optional*): Tensor of shape [bsz, 1, seq, n_latents] representing attention mask. + position_ids (`torch.LongTensor`, *optional*): Tensor of shape [bsz, seq] representing position indices of each input token. + past_key_value (`Tuple[torch.Tensor]`, *optional*): Tuple of tensors containing cached key and value states. + output_attentions (`bool`, *optional*, defaults to `False`): Whether to return attention weights. + use_cache (`bool`, *optional*, defaults to `False`): Whether to use past_key_value for caching. + """ + bsz, q_len, _ = latents.size() + kv_seq_len = q_len + context.size()[1] + + hidden_states = torch.concat([context, latents], dim=-2) + + query_states = self.q_proj(latents) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2) + + past_key_value = getattr(self, "past_key_value", past_key_value) + + if past_key_value is not None: + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx) + + # repeat k/v heads if n_kv_heads < n_heads + key_states = repeat_kv(key_states, self.num_key_value_groups) + value_states = repeat_kv(value_states, self.num_key_value_groups) + + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) + + if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len): + raise ValueError( + f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, q_len, kv_seq_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}" + ) + + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(bsz, q_len, self.num_heads * 
self.head_dim)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralFlashAttention2 with MistralAttention->Idefics2PerceiverAttention,MistralFlashAttention->Idefics2PerceiverFlashAttention,Mistral->Idefics2
+class Idefics2PerceiverFlashAttention2(Idefics2PerceiverAttention):
+    """
+    Idefics2 flash attention module. This module inherits from `Idefics2PerceiverAttention` as the weights of the
+    module stay untouched. The only required change would be on the forward pass where it needs to correctly call the
+    public API of flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    # Copied from transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
+
+    # Ignore copy
+    def forward(
+        self,
+        latents: torch.Tensor,
+        context: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+        bsz, q_len, _ = latents.size()
+        kv_seq_len = q_len + context.size()[1]
+
+        # Query, Key, Value Projections --> Note that in Flamingo, latents are *concatenated* with context prior to attn!
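+        # Shape sketch (illustrative numbers): with 64 latents and a context of 1024 image-patch
+        # embeddings, queries end up with seq_len 64 while keys/values get seq_len 1024 + 64 = 1088.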
+        # Note: This results in queries w/ `seq = n_latents`, and keys, values with `seq = len(context) + n_latents`
+        query_states = self.q_proj(latents)
+        key_states = self.k_proj(torch.cat([context, latents], dim=-2))
+        value_states = self.v_proj(torch.cat([context, latents], dim=-2))
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, kv_seq_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_value is not None:
+            kv_seq_len += past_key_value[0].shape[-2]
+
+        if past_key_value is not None:
+            # Activate slicing cache only if the config has a `sliding_window` attribute
+            if hasattr(self.config, "sliding_window") and kv_seq_len > self.config.sliding_window:
+                slicing_tokens = kv_seq_len - self.config.sliding_window
+
+                past_key = past_key_value[0]
+                past_value = past_key_value[1]
+
+                past_key = past_key[:, :, slicing_tokens:, :].contiguous()
+                past_value = past_value[:, :, slicing_tokens:, :].contiguous()
+
+                if past_key.shape[-2] != self.config.sliding_window - 1:
+                    raise ValueError(
+                        "past key must have a shape of (`batch_size, num_heads, self.config.sliding_window-1,"
+                        f" head_dim`), got {past_key.shape}"
+                    )
+
+                past_key_value = (past_key, past_value)
+
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, slicing_tokens:]
+                    attention_mask = torch.cat([attention_mask, torch.ones_like(attention_mask[:, -1:])], dim=-1)
+
+            key_states = torch.cat([past_key_value[0], key_states], dim=2)
+            value_states = torch.cat([past_key_value[1], value_states], dim=2)
+
+        past_key_value = (key_states, value_states) if use_cache else None
+
+        # repeat k/v heads if n_kv_heads < n_heads
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        dropout_rate = 0.0 if not self.training else self.attention_dropout
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons,
+        # therefore the input hidden states get silently cast to float32. Hence, we need to
+        # cast them back in float16 just to be sure everything works as expected.
+        input_dtype = query_states.dtype
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = torch.get_autocast_gpu_dtype()
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seem to be silently cast to float32, this might be related to"
+                f" the fact that you have upcast embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        # Reshape to the expected shape for Flash Attention
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        attn_output = self._flash_attention_forward(
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            q_len,
+            dropout=dropout_rate,
+            use_sliding_windows=False,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, self.num_heads * self.head_dim).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_value
+
+    def _flash_attention_forward(
+        self,
+        query_states,
+        key_states,
+        value_states,
+        attention_mask,
+        query_length,
+        dropout=0.0,
+        softmax_scale=None,
+        use_sliding_windows=False,
+    ):
+        """
+        Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
+        first unpads the input, then computes the attention scores and pads the final attention scores.
+
+        Args:
+            query_states (`torch.Tensor`):
+                Input query states to be passed to Flash Attention API
+            key_states (`torch.Tensor`):
+                Input key states to be passed to Flash Attention API
+            value_states (`torch.Tensor`):
+                Input value states to be passed to Flash Attention API
+            attention_mask (`torch.Tensor`):
+                The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
+                position of padding tokens and 1 for the position of non-padding tokens.
+            dropout (`float`):
+                Attention dropout
+            softmax_scale (`float`, *optional*):
+                The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim)
+            use_sliding_windows (`bool`, *optional*):
+                Whether to activate sliding window attention.
+        """
+        if not self._flash_attn_uses_top_left_mask:
+            causal = self.is_causal
+        else:
+            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in LlamaFlashAttention2 __init__.
+ causal = self.is_causal and query_length != 1 + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + if not use_sliding_windows: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + else: + if not use_sliding_windows: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + ) + else: + attn_output = flash_attn_func( + query_states, + key_states, + value_states, + dropout, + softmax_scale=softmax_scale, + causal=causal, + window_size=(self.config.sliding_window, self.config.sliding_window), + ) + + return attn_output + + def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length): + batch_size, kv_seq_len, num_heads, head_dim = key_layer.shape + + # On the first iteration we need to properly re-create the padding mask + # by slicing it on the proper place + if kv_seq_len != attention_mask.shape[-1]: + attention_mask_num_tokens = attention_mask.shape[-1] + attention_mask = attention_mask[:, attention_mask_num_tokens - kv_seq_len :] + + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + value_layer = index_first_axis(value_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k) + + if query_length == kv_seq_len: + query_layer = index_first_axis( + query_layer.reshape(batch_size * kv_seq_len, num_heads, head_dim), indices_k + ) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. 
+ attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +IDEFICS2_PERCEIVER_ATTENTION_CLASSES = { + "eager": Idefics2PerceiverAttention, + "flash_attention_2": Idefics2PerceiverFlashAttention2, +} + + +class Idefics2PerceiverLayer(nn.Module): + def __init__(self, config, layer_idx: int): + super().__init__() + self.hidden_size = config.text_config.hidden_size + self.n_latents = config.perceiver_config.resampler_n_latents + self.depth = config.perceiver_config.resampler_depth + self.rms_norm_eps = config.text_config.rms_norm_eps + + self.input_latents_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps) + self.input_context_norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps) + self.self_attn = IDEFICS2_PERCEIVER_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx) + self.post_attention_layernorm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps) + self.mlp = Idefics2MLP( + hidden_size=config.text_config.hidden_size, + intermediate_size=config.text_config.hidden_size * 4, + output_size=config.text_config.hidden_size, + hidden_act=config.perceiver_config.hidden_act, + ) + + def forward( + self, + latents: torch.Tensor, + context: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + latents (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + context (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states + """ + residual = latents + + latents = self.input_latents_norm(latents) + context = self.input_context_norm(context) + + latents, self_attn_weights, present_key_value = self.self_attn( + latents=latents, + context=context, + attention_mask=attention_mask, + ) + latents = residual + latents + residual = latents + + latents = self.post_attention_layernorm(latents) + latents = self.mlp(latents) + latents = residual + latents + + outputs = (latents,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class Idefics2PerceiverResampler(nn.Module): + def __init__(self, config) -> None: + """ + Instantiates a Perceiver Resampler that operates over a sequence of embeddings (say from a ResNet or ViT or + MAE) of a given dimension, performs `depth` blocks of cross-attention with a fixed `n_latents` inputs, then + returns a Tensor of shape [bsz, n_latents, embed_dim]. 
The Resampler acts as a form of learned pooling and + is derived from [Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206). + """ + super().__init__() + self.hidden_size = config.text_config.hidden_size + self.hidden_act = config.perceiver_config.hidden_act + self.n_latents = config.perceiver_config.resampler_n_latents + self.depth = config.perceiver_config.resampler_depth + self.rms_norm_eps = config.text_config.rms_norm_eps + + # Create Latents for Perceiver + self.latents = nn.Parameter(torch.ones(self.n_latents, self.hidden_size)) + + # Create Transformer Blocks + self.layers = nn.ModuleList([Idefics2PerceiverLayer(config, idx) for idx in range(self.depth)]) + self.norm = Idefics2RMSNorm(self.hidden_size, eps=self.rms_norm_eps) + + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + def forward( + self, + context: torch.Tensor, + attention_mask, + ) -> torch.Tensor: + # seq embed -> bsz seq embed + latents = self.latents.unsqueeze(0).expand((context.shape[0], *self.latents.size())) + + latent_attention_mask = torch.ones( + (attention_mask.size(0), latents.size(1)), dtype=attention_mask.dtype, device=attention_mask.device + ) + attention_mask = torch.cat([attention_mask, latent_attention_mask], dim=-1) + attention_mask = ( + _prepare_4d_attention_mask(attention_mask, latents.dtype, tgt_len=self.n_latents) + if not self._use_flash_attention_2 + else attention_mask + ) + + compressed_context = latents + for perceiver_layer in self.layers: + layer_outputs = perceiver_layer( + compressed_context, + context, + attention_mask=attention_mask, + position_ids=None, + past_key_value=None, + output_attentions=False, + use_cache=False, + ) + + compressed_context = layer_outputs[0] + + compressed_context = self.norm(compressed_context) + + return compressed_context + + +class Idefics2Connector(nn.Module): + def __init__(self, config): + super().__init__() + self.modality_projection = Idefics2MLP( + hidden_size=config.vision_config.hidden_size, + intermediate_size=config.text_config.intermediate_size, + output_size=config.text_config.hidden_size, + hidden_act=config.text_config.hidden_act, + ) + self.perceiver_resampler = Idefics2PerceiverResampler(config) + + def forward(self, image_hidden_states, attention_mask): + image_hidden_states = self.modality_projection(image_hidden_states) + image_hidden_states = self.perceiver_resampler(context=image_hidden_states, attention_mask=attention_mask) + return image_hidden_states + + +IDEFICS2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Idefics2Config`] or [`Idefics2VisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. 
+""" + + +@add_start_docstrings( + "The bare Idefics2 Model outputting raw hidden-states without any specific head on top.", + IDEFICS2_START_DOCSTRING, +) +class Idefics2PreTrainedModel(PreTrainedModel): + config_class = Idefics2Config + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Idefics2VisionAttention", "Idefics2MLP", "Idefics2PerceiverLayer", "Idefics2DecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + + def _init_weights(self, module): + # important: this ported version of Idefics2 isn't meant for training from scratch - only + # inference and fine-tuning - so the proper init weights code has been removed - the original codebase + # https://github.com/haotian-liu/LLaVA/tree/main/idefics2 should serve for that purpose + std = ( + self.config.text_config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if hasattr(module, "class_embedding"): + module.class_embedding.data.normal_(mean=0.0, std=std) + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @classmethod + def _autoset_attn_implementation( + cls, + config, + use_flash_attention_2: bool = False, + torch_dtype: Optional[torch.dtype] = None, + device_map: Optional[Union[str, Dict[str, int]]] = None, + check_device_map: bool = True, + **kwargs, + ): + """ + Overrides the method in `PreTrainedModel` to update the vision config with the correct attention implementation + """ + config = super()._autoset_attn_implementation( + config=config, + use_flash_attention_2=use_flash_attention_2, + torch_dtype=torch_dtype, + device_map=device_map, + check_device_map=check_device_map, + **kwargs, + ) + config.vision_config._attn_implementation = config._attn_implementation + return config + + +IDEFICS2_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. 
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_images, num_channels, image_size, image_size)`):
+            The tensors corresponding to the input images. Pixel values can be obtained using
+            [`AutoImageProcessor`]. See [`Idefics2ImageProcessor.__call__`] for details.
+        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, num_images, image_size, image_size)`, *optional*):
+            Mask to avoid performing attention on padding pixel indices.
+        image_hidden_states (`torch.FloatTensor` of shape `(batch_size * num_images, image_seq_len, hidden_size)`):
+            The hidden states of the image encoder after modality projection and perceiver resampling.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+""" + + +@add_start_docstrings( + """Idefics2 model consisting of a SIGLIP vision encoder and Mistral language decoder""", + IDEFICS2_START_DOCSTRING, +) +class Idefics2Model(Idefics2PreTrainedModel): + def __init__(self, config: Idefics2Config): + super().__init__(config) + self.padding_idx = self.config.text_config.pad_token_id + self.vocab_size = self.config.text_config.vocab_size + + self.vision_model = Idefics2VisionTransformer(config.vision_config) + self.connector = Idefics2Connector(config) + self.text_model = AutoModel.from_config(config.text_config) + + self.image_seq_len = config.perceiver_config.resampler_n_latents + self.image_token_id = self.config.image_token_id + + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self.post_init() + + def enable_input_require_grads(self): + """ + Enables the gradients for the input embeddings. + + This is useful for lora when using gradient checkpointing. + c.f. https://github.com/huggingface/peft/issues/1402#issuecomment-1913675032 + + Override to set output.requires_grad = True for both the decoder's and vision model's embeddings. + """ + + def get_lowest_module(module): + if len(list(module.children())) == 0: + # If the module has no children, it is a leaf module (e.g., Linear, Conv2d, etc.) + return module + else: + # Recursively call the function on each child module + return get_lowest_module(list(module.children())[0]) + + def make_inputs_require_grads(module, input, output): + output.requires_grad_(True) + + self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads) + self._vision_require_grads_hook = get_lowest_module(self.vision_model).register_forward_hook( + make_inputs_require_grads + ) + + def get_input_embeddings(self): + return self.text_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_model.set_input_embeddings(value) + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: + model_embeds = self.text_model.resize_token_embeddings( + new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of + ) + self.config.text_config.vocab_size = model_embeds.num_embeddings + return model_embeds + + def inputs_merger( + self, + input_ids: torch.LongTensor, + inputs_embeds: Optional[torch.Tensor], + image_hidden_states: Optional[torch.Tensor], + ): + """ + This method aims at merging the token embeddings with the image hidden states into one single sequence of vectors that are fed to the transformer LM. + The merging happens as follows: + - The text token sequence is: `tok_1 tok_2 tok_3 ... tok_4`. + - We get the image hidden states for the image through the vision encoder (and potentially the perceiver), and that hidden state is then projected into the text embedding space. + We thus have a sequence of image hidden states of size (1, image_seq_len, hidden_dim), where 1 is for batch_size of 1 image and hidden_dim is the hidden_dim of the LM transformer. + - The merging happens so that we obtain the following sequence: `vector_tok_1 vector_tok_2 vector_tok_3 vector_fake_tok_around_image {sequence of image_seq_len image hidden states} vector_fake_toke_around_image vector_tok_4`. That sequence is fed to the LM. + - To fit the format of that sequence, `input_ids`, `input_embeds`, `attention_mask` are all 3 adapted to insert the image hidden states. 
+ """ + num_images, _, vision_hidden_size = image_hidden_states.shape + special_image_token_mask = input_ids == self.image_token_id + new_inputs_embeds = inputs_embeds.clone() + reshaped_image_hidden_states = image_hidden_states.view(-1, vision_hidden_size) + new_inputs_embeds[special_image_token_mask] = reshaped_image_hidden_states + return new_inputs_embeds + + @add_start_docstrings_to_model_forward( + """ + Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to + the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where + max_num_images is the maximum number of images among the batch_size samples in the batch. + + Padding images are not needed beyond padding the pixel_values at the entrance of the model. + For efficiency, we only pass through the vision_model's forward the real images by + discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where + image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3. + """, + IDEFICS2_INPUTS_DOCSTRING, + ) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_hidden_states: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Idefics2BaseModelOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + past_seen_tokens = 0 + if use_cache: + if not isinstance(past_key_values, Cache): + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + past_seen_tokens = past_key_values.get_usable_length(seq_length) + + if inputs_embeds is not None and input_ids is None and past_seen_tokens == 0: + raise ValueError("When first calling the model, if input_embeds are passed, input_ids should not be None.") + + if inputs_embeds is None: + inputs_embeds = self.text_model.get_input_embeddings()(input_ids) + + # START VISUAL INPUTS INTEGRATION + if pixel_values is not None and image_hidden_states is not None: + raise ValueError("You cannot specify both pixel_values and image_hidden_states at the same time") + elif pixel_values is not None: + batch_size, num_images, num_channels, height, width = pixel_values.shape + pixel_values = pixel_values.to(dtype=self.dtype) # fp16 compatibility + pixel_values = pixel_values.view(batch_size * num_images, *pixel_values.shape[2:]) + + # Remove padding images - padding images are full 0. 
+            nb_values_per_image = pixel_values.shape[1:].numel()
+            real_images_inds = (pixel_values == 0.0).sum(dim=(-1, -2, -3)) != nb_values_per_image
+            pixel_values = pixel_values[real_images_inds].contiguous()
+
+            # Handle the vision attention mask
+            if pixel_attention_mask is None:
+                pixel_attention_mask = torch.ones(
+                    size=(pixel_values.size(0), pixel_values.size(2), pixel_values.size(3)),
+                    dtype=torch.bool,
+                    device=pixel_values.device,
+                )
+            else:
+                # Remove padding images from the mask
+                pixel_attention_mask = pixel_attention_mask.view(
+                    batch_size * num_images, *pixel_attention_mask.shape[2:]
+                )
+                pixel_attention_mask = pixel_attention_mask[real_images_inds].contiguous()
+
+            patch_size = self.config.vision_config.patch_size
+            patches_subgrid = pixel_attention_mask.unfold(dimension=1, size=patch_size, step=patch_size)
+            patches_subgrid = patches_subgrid.unfold(dimension=2, size=patch_size, step=patch_size)
+            patch_attention_mask = (patches_subgrid.sum(dim=(-1, -2)) > 0).bool()
+
+            # Get sequence from the vision encoder
+            image_hidden_states = self.vision_model(
+                pixel_values=pixel_values,
+                patch_attention_mask=patch_attention_mask,
+            ).last_hidden_state
+
+            # Modality projection & resampling
+            image_hidden_states = self.connector(
+                image_hidden_states, attention_mask=patch_attention_mask.view(pixel_values.size(0), -1)
+            )
+
+        elif image_hidden_states is not None:
+            image_hidden_states = image_hidden_states.to(dtype=self.dtype, device=input_ids.device)
+
+        if past_seen_tokens == 0 and inputs_embeds is not None and image_hidden_states is not None:
+            # When we generate, we don't want to replace the potential image_token_id that we generated by images
+            # that simply don't exist
+            inputs_embeds = self.inputs_merger(
+                input_ids=input_ids,
+                inputs_embeds=inputs_embeds,
+                image_hidden_states=image_hidden_states,
+            )
+
+        outputs = self.text_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return tuple(v for v in [*outputs, image_hidden_states] if v is not None)
+
+        return Idefics2BaseModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """The Idefics2 Model with a language modeling head. It is made up of a SigLIP vision encoder and a Mistral language decoder, with a language modeling head on top.""",
+    IDEFICS2_START_DOCSTRING,
+)
+class Idefics2ForConditionalGeneration(Idefics2PreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Idefics2Model(config)
+        self.image_token_id = self.config.image_token_id
+
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.vocab_size = config.text_config.vocab_size
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def enable_input_require_grads(self):
+        """
+        Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping
+        the model weights fixed.
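+
+        Both the text input embeddings and the vision model's embeddings are hooked, so gradients can
+        flow back to adapter weights attached anywhere in either tower.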
+ """ + + def make_inputs_require_grads(module, input, output): + output.requires_grad_(True) + + self._text_require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads) + self._vision_require_grads_hook = self.model.vision_model.get_input_embeddings().register_forward_hook( + make_inputs_require_grads + ) + + def get_input_embeddings(self): + return self.model.text_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.model.text_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: + # model_embeds = self.model.resize_token_embeddings(new_num_tokens=new_num_tokens, pad_to_multiple_of=pad_to_multiple_of) + model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + if new_num_tokens is None and pad_to_multiple_of is None: + return model_embeds + + # Update base model and current model config + # Ignore copy + self.config.text_config.vocab_size = model_embeds.weight.shape[0] + self.vocab_size = self.config.text_config.vocab_size + + # Tie weights again if needed + self.tie_weights() + + return model_embeds + + def tie_weights(self): + """ + Overwrite `transformers.modeling_utils.PreTrainedModel.tie_weights` to handle the case of DecoupledLinear and DecoupledEmbedding. + """ + output_embeddings = self.get_output_embeddings() + input_embeddings = self.get_input_embeddings() + + if getattr(self.config, "tie_word_embeddings", True): + output_embeddings.weight = input_embeddings.weight + + @add_start_docstrings_to_model_forward(IDEFICS2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Idefics2CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_attention_mask: Optional[torch.BoolTensor] = None, + image_hidden_states: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Idefics2CausalLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. 
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from io import BytesIO
+
+        >>> from transformers import AutoProcessor, AutoModelForVision2Seq
+        >>> from transformers.image_utils import load_image
+
+        >>> DEVICE = "cuda:0"
+
+        >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
+        >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
+        >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
+        >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
+
+        >>> processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
+        >>> model = AutoModelForVision2Seq.from_pretrained(
+        ...     "HuggingFaceM4/idefics2-8b-base",
+        ... ).to(DEVICE)
+
+        >>> BAD_WORDS_IDS = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+        >>> EOS_WORDS_IDS = [processor.tokenizer.eos_token_id]
+
+        >>> # Create inputs
+        >>> prompts = [
+        ...     "<image>In this image, we can see the city of New York, and more specifically the Statue of Liberty.<image>In this image,",
+        ...     "In which city is that bridge located?<image>",
+        ... ]
+        >>> images = [[image1, image2], [image3]]
+        >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(DEVICE)
+
+        >>> # Generate
+        >>> generated_ids = model.generate(**inputs, bad_words_ids=BAD_WORDS_IDS, max_new_tokens=500)
+        >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+        >>> print(generated_texts)
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            pixel_values=pixel_values,
+            pixel_attention_mask=pixel_attention_mask,
+            image_hidden_states=image_hidden_states,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+        logits = logits.float()
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            # Shift so that tokens < n predict n
+            if attention_mask is not None:
+                shift_attention_mask = attention_mask[..., 1:].to(logits.device)
+                shift_logits = logits[..., :-1, :][shift_attention_mask != 0].contiguous()
+                shift_labels = labels[..., 1:][shift_attention_mask != 0].contiguous()
+            else:
+                shift_logits = logits[..., :-1, :].contiguous()
+                shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=self.image_token_id)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return Idefics2CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
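+            # Expose the resampled image embeddings so that `generate` can reuse them across decoding
+            # steps instead of re-encoding pixel_values (see `prepare_inputs_for_generation` below).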
image_hidden_states=outputs.image_hidden_states, + ) + + def prepare_inputs_for_generation( + self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs + ): + # Omit tokens covered by past_key_values + if past_key_values is not None: + if isinstance(past_key_values, Cache): + cache_length = past_key_values.get_seq_length() + past_length = past_key_values.seen_tokens + max_cache_length = past_key_values.get_max_length() + else: + cache_length = past_length = past_key_values[0][0].shape[2] + max_cache_length = None + + # Keep only the unprocessed tokens: + # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where + # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as + # input) + if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]: + input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :] + # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard + # input_ids based on the past_length. + elif past_length < input_ids.shape[1]: + input_ids = input_ids[:, past_length:] + # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens. + + # If we are about to go beyond the maximum cache length, we need to crop the input attention mask. + if ( + max_cache_length is not None + and attention_mask is not None + and cache_length + input_ids.shape[1] > max_cache_length + ): + attention_mask = attention_mask[:, -max_cache_length:] + + position_ids = kwargs.get("position_ids", None) + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + image_hidden_states = kwargs.get("image_hidden_states", None) + if image_hidden_states is not None: + pixel_values = None + pixel_attention_mask = None + else: + pixel_values = kwargs.get("pixel_values", None) + pixel_attention_mask = kwargs.get("pixel_attention_mask", None) + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "pixel_attention_mask": pixel_attention_mask, + "image_hidden_states": image_hidden_states, + } + ) + return model_inputs + + def _update_model_kwargs_for_generation(self, outputs, model_kwargs, is_encoder_decoder, **kwargs): + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + **kwargs, + ) + # Get the precomputed image_hidden_states + model_kwargs["image_hidden_states"] = outputs.image_hidden_states + return model_kwargs + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM._reorder_cache + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + 
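+        # Each layer's cached key/value tensors are gathered along the batch axis so the cache rows
+        # follow the beams selected by `beam_idx` at this step.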
        return reordered_past

diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py
new file mode 100644
index 0000000000..7b98519928
--- /dev/null
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -0,0 +1,348 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for IDEFICS2.
+"""
+
+from typing import TYPE_CHECKING, Dict, List, Optional, Union
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image, load_image
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import AddedToken, BatchEncoding, PaddingStrategy, TextInput, TruncationStrategy
+from ...utils import TensorType, logging
+
+
+if TYPE_CHECKING:
+    from ...pipelines.conversational import Conversation
+    from ...tokenization_utils_base import PreTokenizedInput
+
+
+logger = logging.get_logger(__name__)
+
+
+def is_url(val) -> bool:
+    return isinstance(val, str) and val.startswith("http")
+
+
+def is_image_or_image_url(elem):
+    return is_url(elem) or is_valid_image(elem)
+
+
+class Idefics2Processor(ProcessorMixin):
+    r"""
+    Constructs an IDEFICS2 processor which wraps a Llama tokenizer and an IDEFICS2 image processor into a single processor.
+
+    [`Idefics2Processor`] offers all the functionalities of [`Idefics2ImageProcessor`] and [`LlamaTokenizerFast`]. See
+    the docstring of [`~Idefics2Processor.__call__`] and [`~Idefics2Processor.decode`] for more information.
+
+    Args:
+        image_processor (`Idefics2ImageProcessor`):
+            An instance of [`Idefics2ImageProcessor`]. The image processor is a required input.
+        tokenizer (`PreTrainedTokenizerBase`, *optional*):
+            An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input.
+        image_seq_len (`int`, *optional*, defaults to 64):
+            The length of the image sequence i.e. the number of `<image>` tokens per image in the input.
+            This parameter is used to build the string from the input prompt and image tokens and should match the
+            config.perceiver_config.resampler_n_latents value for the model used.
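+        chat_template (`str`, *optional*):
+            A Jinja template to convert lists of chat messages into a tokenizable string. If not provided,
+            `default_chat_template` is used; it can be passed as a keyword argument at construction time.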
+ """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "Idefics2ImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer=None, image_seq_len: int = 64, **kwargs): + if image_processor is None: + raise ValueError("You need to specify an `image_processor`.") + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + self.fake_image_token = AddedToken("", normalized=False, special=True) + self.image_token = AddedToken("", normalized=False, special=True) + self.end_of_utterance_token = AddedToken("", normalized=False, special=True) + self.image_seq_len = image_seq_len + + tokens_to_add = { + "additional_special_tokens": [self.fake_image_token, self.image_token, self.end_of_utterance_token] + } + tokenizer.add_special_tokens(tokens_to_add) + + # Stores a Jinja template that formats chat histories into tokenizable strings + self.chat_template = kwargs.pop("chat_template", None) + + super().__init__(image_processor, tokenizer) + + def _extract_images_from_prompts(self, prompts): + prompt_images = [] + for prompt in prompts: + images = [] + for elem in prompt: + if is_valid_image(elem): + images.append(elem) + elif is_url(elem): + images.append(load_image(elem)) + prompt_images.append(images) + return prompt_images + + def __call__( + self, + text: Union[TextInput, "PreTokenizedInput", List[TextInput], List["PreTokenizedInput"]] = None, + images: Union[ImageInput, List[ImageInput], List[List[ImageInput]]] = None, + image_seq_len: Optional[int] = None, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + is_split_into_words: bool = False, + add_special_tokens: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + ) -> BatchEncoding: + """ + Processes the input prompts and returns a BatchEncoding. + + Example: + + ```python + >>> import requests + >>> from transformers import Idefics2Processor + >>> from transformers.image_utils import load_image + + >>> processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2) + >>> processor.image_processor.do_image_splitting = False # Force as False to simplify the example + + >>> url1 = "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg" + >>> url2 = "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg" + + >>> image1, image2 = load_image(url1), load_image(url2) + >>> images = [[image1], [image2]] + + >>> text = [ + ... "In this image, we see", + ... "bla bla bla", + ... ] + >>> outputs = processor(text=text, images=images, return_tensors="pt", padding=True) + >>> input_ids = outputs.input_ids + >>> input_tokens = processor.tokenizer.batch_decode(input_ids) + >>> print(input_tokens) + [' In this image, we see', ' bla bla bla'] + ``` + + Args: + text (`Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + + Wherever an image token, `` is encountered it is expanded to + `` + `` * `image_seq_len` * `. 
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. If it is of type `List[ImageInput]`, it's assumed that this is for a single prompt i.e. of batch size 1.
+            image_seq_len (`int`, *optional*):
+                The length of the image sequence. If not provided, the default value is used.
+            padding (`Union[bool, str, PaddingStrategy]`, *optional*, defaults to `False`):
+                Padding strategy applied to the input ids. See [`PreTrainedTokenizerFast.pad`] for more information.
+            truncation (`Union[bool, str, TruncationStrategy]`, *optional*):
+                Truncation strategy applied to the input ids. See [`PreTrainedTokenizerFast.__call__`] for more information.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding/truncation length. See
+                [`PreTrainedTokenizerFast.__call__`] for more information.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether the input text is split into words or not. If set to `True`, the tokenizer will skip the
+                tokenization process and assume the input is already tokenized.
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether to add special tokens or not. See [`PreTrainedTokenizerFast.__call__`] for more information.
+            return_tensors (`Union[str, TensorType]`, *optional*):
+                If set, will return tensors of a particular framework. See [`PreTrainedTokenizerFast.__call__`] for more
+                information.
+        """
+        image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
+
+        n_images_in_text = []
+        inputs = BatchFeature()
+
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not isinstance(text, list) or not isinstance(text[0], str):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+            # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
+            fake_image_token = self.fake_image_token.content
+            image_token = self.image_token.content
+            image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
+
+            if self.image_processor.do_image_splitting:
+                # A single image token is split into 4 patches + 1 original image
+                image_str = image_str * 5
+
+            prompt_strings = []
+            for sample in text:
+                n_images_in_text.append(sample.count(image_token))
+                sample = sample.replace(image_token, image_str)
+                # Remove any double fake tokens if images are adjacent
+                sample = sample.replace(f"{fake_image_token}{fake_image_token}", f"{fake_image_token}")
+                prompt_strings.append(sample)
+
+            text_inputs = self.tokenizer(
+                text=prompt_strings,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                is_split_into_words=is_split_into_words,
+                return_tensors=return_tensors,
+            )
+            inputs.update(text_inputs)
+
+        if images is not None:
+            if is_image_or_image_url(images):
+                images = [[images]]
+            elif isinstance(images, list) and is_image_or_image_url(images[0]):
+                images = [images]
+            elif (
+                not isinstance(images, list)
+                or not isinstance(images[0], list)
+                or not is_image_or_image_url(images[0][0])
+            ):
+                raise ValueError(
+                    "Invalid input images. Please provide a single image or a list of images or a list of list of images."
+                )
+
+            n_images_in_images = [len(sample) for sample in images]
+            if text is not None and not n_images_in_images == n_images_in_text:
+                raise ValueError(
+                    f"The number of images in the text {n_images_in_text} and images {n_images_in_images} should be the same."
+                )
+
+            # Load images if they are URLs
+            images = [[load_image(im) for im in sample] for sample in images]
+            image_inputs = self.image_processor(images, return_tensors=return_tensors)
+            inputs.update(image_inputs)
+
+        return inputs
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+    def apply_chat_template(
+        self,
+        conversation: Union[List[Dict[str, str]], "Conversation"],
+        chat_template: Optional[str] = None,
+        tokenize: bool = False,
+        **kwargs,
+    ) -> str:
+        """
+        Overrides the tokenizer's `apply_chat_template` method to apply the IDEFICS2 chat template by default
+        if no chat template is provided.
+
+        By default, the output isn't tokenized. This is because the IDEFICS2 chat template is designed to insert
+        the image token <image> into the sequence according to the message, but does not handle expanding the image
+        tokens to the sequence length or adding the surrounding tokens e.g. <fake_token_around_image>.
+
+        Args:
+            conversation (`Union[List[Dict[str, str]], "Conversation"]`):
+                The conversation to format.
+            chat_template (`str`, *optional*):
+                The Jinja template to use for formatting the conversation. If not provided, the default chat template
+                is used.
+            tokenize (`bool`, *optional*, defaults to `False`):
+                Whether to tokenize the output or not.
+            **kwargs:
+                Additional keyword arguments for the tokenizer's `apply_chat_template` method.
+        """
+
+        if chat_template is None:
+            if self.chat_template is not None:
+                chat_template = self.chat_template
+            else:
+                chat_template = self.default_chat_template
+
+        return self.tokenizer.apply_chat_template(
+            conversation, chat_template=chat_template, tokenize=tokenize, **kwargs
+        )
+
+    @property
+    def default_chat_template(self):
+        """
+        This template formats inputs in the form of a chat history. For each message in the chat history:
+        * the template will output the role of the speaker followed by the content of the message.
+        * content can be a single string or a list of strings and images.
+        * If the content element is an image, the template will output a sequence of <image> tokens and a
+          <fake_token_around_image> token before and after each image
+        * The template will output an <end_of_utterance> token at the end of each message.
+
+        Example:
+
+        ```python
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What’s in this image?"},
+                {"type": "image"},
+                {"type": "image"},
+            ],
+        },
+        {
+            "role": "assistant",
+            "content": [{"type": "text", "text": "This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground."},]
+        }]
+        ```
+
+        Will create outputs like:
+        ```
+        User: What is in this Image?<image><image><end_of_utterance>
+        Assistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.<end_of_utterance>
+        ```
+        """
+        # fmt: off
+        return (
+            "{% for message in messages %}"
+            "{{message['role'].capitalize()}}"
+            "{% if message['content'][0]['type'] == 'image' %}"
+            "{{':'}}"
+            "{% else %}"
+            "{{': '}}"
+            "{% endif %}"
+            "{% for line in message['content'] %}"
+            "{% if line['type'] == 'text' %}"
+            "{{line['text']}}"
+            "{% elif line['type'] == 'image' %}"
+            "{{ '<image>' }}"
+            "{% endif %}"
+            "{% endfor %}"
+            "<end_of_utterance>\n"
+            "{% endfor %}"
+
+            "{% if add_generation_prompt %}"
+            "{{ 'Assistant:' }}"
+            "{% endif %}"
+        )
+        # fmt: on
diff --git a/src/transformers/utils/dummy_pt_objects.py b/src/transformers/utils/dummy_pt_objects.py
index 15dc4f9bc2..b934a07321 100644
--- a/src/transformers/utils/dummy_pt_objects.py
+++ b/src/transformers/utils/dummy_pt_objects.py
@@ -4405,6 +4405,37 @@ class IdeficsProcessor(metaclass=DummyObject):
         requires_backends(self, ["torch"])
 
 
+IDEFICS2_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class Idefics2ForConditionalGeneration(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Idefics2Model(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Idefics2PreTrainedModel(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
+class Idefics2Processor(metaclass=DummyObject):
+    _backends = ["torch"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch"])
+
+
 IMAGEGPT_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/src/transformers/utils/dummy_vision_objects.py b/src/transformers/utils/dummy_vision_objects.py
index 80b418adc1..7510f91dfc 100644
--- a/src/transformers/utils/dummy_vision_objects.py
+++ b/src/transformers/utils/dummy_vision_objects.py
@@ -261,6 +261,13 @@ class IdeficsImageProcessor(metaclass=DummyObject):
         requires_backends(self, ["vision"])
 
 
+class Idefics2ImageProcessor(metaclass=DummyObject):
+    _backends = ["vision"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["vision"])
+
+
 class ImageGPTFeatureExtractor(metaclass=DummyObject):
     _backends = ["vision"]
 
diff --git a/tests/models/idefics2/__init__.py b/tests/models/idefics2/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/models/idefics2/test_image_processing_idefics2.py b/tests/models/idefics2/test_image_processing_idefics2.py
new file mode 100644
index 0000000000..4b3af1f632
--- /dev/null
+++ b/tests/models/idefics2/test_image_processing_idefics2.py
@@ -0,0 +1,270 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import unittest
+
+import numpy as np
+
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_torch_available, is_vision_available
+
+from ...test_image_processing_common import ImageProcessingTestMixin
+
+
+if is_vision_available():
+    from PIL import Image
+
+    from transformers import Idefics2ImageProcessor
+
+
+if is_torch_available():
+    import torch
+
+
+class Idefics2ImageProcessingTester(unittest.TestCase):
+    def __init__(
+        self,
+        parent,
+        batch_size=7,
+        num_channels=3,
+        num_images=1,
+        image_size=18,
+        min_resolution=30,
+        max_resolution=400,
+        do_resize=True,
+        size=None,
+        do_rescale=True,
+        rescale_factor=1 / 255,
+        do_normalize=True,
+        image_mean=[0.5, 0.5, 0.5],
+        image_std=[0.5, 0.5, 0.5],
+        do_convert_rgb=True,
+        do_pad=True,
+        do_image_splitting=True,
+    ):
+        size = size if size is not None else {"shortest_edge": 378, "longest_edge": 980}
+        self.parent = parent
+        self.batch_size = batch_size
+        self.num_channels = num_channels
+        self.num_images = num_images
+        self.image_size = image_size
+        self.min_resolution = min_resolution
+        self.max_resolution = max_resolution
+        self.do_resize = do_resize
+        self.size = size
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_convert_rgb = do_convert_rgb
+        self.do_pad = do_pad
+        self.do_image_splitting = do_image_splitting
+
+    def prepare_image_processor_dict(self):
+        return {
+            "do_convert_rgb": self.do_convert_rgb,
+            "do_resize": self.do_resize,
+            "size": self.size,
+            "do_rescale": self.do_rescale,
+            "rescale_factor": self.rescale_factor,
+            "do_normalize": self.do_normalize,
+            "image_mean": self.image_mean,
+            "image_std": self.image_std,
+            "do_pad": self.do_pad,
+            "do_image_splitting": self.do_image_splitting,
+        }
+
+    def get_expected_values(self, image_inputs, batched=False):
+        """
+        This function computes the expected height and width when providing images to Idefics2ImageProcessor,
+        assuming do_resize is set to True with `shortest_edge` and `longest_edge` size constraints.
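+
+        The expected size preserves the aspect ratio, caps the longer side at `longest_edge`, and then
+        enforces a floor of `shortest_edge` on both sides, mirroring the resizing logic below.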
+ """ + if not batched: + shortest_edge = self.size["shortest_edge"] + longest_edge = self.size["longest_edge"] + image = image_inputs[0] + if isinstance(image, Image.Image): + w, h = image.size + else: + h, w = image.shape[1], image.shape[2] + + aspect_ratio = w / h + if w > h and w >= longest_edge: + w = longest_edge + h = int(w / aspect_ratio) + elif h > w and h >= longest_edge: + h = longest_edge + w = int(h * aspect_ratio) + w = max(w, shortest_edge) + h = max(h, shortest_edge) + expected_height = h + expected_width = w + else: + expected_values = [] + for images in image_inputs: + for image in images: + expected_height, expected_width = self.get_expected_values([image]) + expected_values.append((expected_height, expected_width)) + expected_height = max(expected_values, key=lambda item: item[0])[0] + expected_width = max(expected_values, key=lambda item: item[1])[1] + + return expected_height, expected_width + + def expected_output_image_shape(self, images): + height, width = self.get_expected_values(images, batched=True) + effective_nb_images = self.num_images * 5 if self.do_image_splitting else 1 + return effective_nb_images, self.num_channels, height, width + + def prepare_image_inputs( + self, + batch_size=None, + min_resolution=None, + max_resolution=None, + num_channels=None, + num_images=None, + size_divisor=None, + equal_resolution=False, + numpify=False, + torchify=False, + ): + """This function prepares a list of PIL images, or a list of numpy arrays if one specifies numpify=True, + or a list of PyTorch tensors if one specifies torchify=True. + + One can specify whether the images are of the same resolution or not. + """ + assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" + + batch_size = batch_size if batch_size is not None else self.batch_size + min_resolution = min_resolution if min_resolution is not None else self.min_resolution + max_resolution = max_resolution if max_resolution is not None else self.max_resolution + num_channels = num_channels if num_channels is not None else self.num_channels + num_images = num_images if num_images is not None else self.num_images + + images_list = [] + for i in range(batch_size): + images = [] + for j in range(num_images): + if equal_resolution: + width = height = max_resolution + else: + # To avoid getting image width/height 0 + if size_divisor is not None: + # If `size_divisor` is defined, the image needs to have width/size >= `size_divisor` + min_resolution = max(size_divisor, min_resolution) + width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) + images.append(np.random.randint(255, size=(num_channels, width, height), dtype=np.uint8)) + images_list.append(images) + + if not numpify and not torchify: + # PIL expects the channel dimension as last dimension + images_list = [[Image.fromarray(np.moveaxis(image, 0, -1)) for image in images] for images in images_list] + + if torchify: + images_list = [[torch.from_numpy(image) for image in images] for images in images_list] + + return images_list + + +@require_torch +@require_vision +class Idefics2ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase): + image_processing_class = Idefics2ImageProcessor if is_vision_available() else None + + def setUp(self): + self.image_processor_tester = Idefics2ImageProcessingTester(self) + + @property + def image_processor_dict(self): + return self.image_processor_tester.prepare_image_processor_dict() + + def test_image_processor_properties(self): + 
image_processing = self.image_processing_class(**self.image_processor_dict) + self.assertTrue(hasattr(image_processing, "do_convert_rgb")) + self.assertTrue(hasattr(image_processing, "do_resize")) + self.assertTrue(hasattr(image_processing, "size")) + self.assertTrue(hasattr(image_processing, "do_rescale")) + self.assertTrue(hasattr(image_processing, "rescale_factor")) + self.assertTrue(hasattr(image_processing, "do_normalize")) + self.assertTrue(hasattr(image_processing, "image_mean")) + self.assertTrue(hasattr(image_processing, "image_std")) + self.assertTrue(hasattr(image_processing, "do_pad")) + self.assertTrue(hasattr(image_processing, "do_image_splitting")) + + def test_call_numpy(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random numpy tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, numpify=True) + for sample_images in image_inputs: + for image in sample_images: + self.assertIsInstance(image, np.ndarray) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pil(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PIL images + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False) + for images in image_inputs: + for image in images: + self.assertIsInstance(image, Image.Image) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + encoded_images = image_processing(image_inputs, return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + self.assertEqual( + tuple(encoded_images.shape), (self.image_processor_tester.batch_size, *expected_output_image_shape) + ) + + def test_call_pytorch(self): + # Initialize image_processing + image_processing = self.image_processing_class(**self.image_processor_dict) + # create random PyTorch tensors + image_inputs = self.image_processor_tester.prepare_image_inputs(equal_resolution=False, torchify=True) + + for images in image_inputs: + for image in images: + self.assertIsInstance(image, torch.Tensor) + + # Test not batched input + encoded_images = image_processing(image_inputs[0], return_tensors="pt").pixel_values + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape([image_inputs[0]]) + self.assertEqual(tuple(encoded_images.shape), (1, *expected_output_image_shape)) + + # Test batched + expected_output_image_shape = self.image_processor_tester.expected_output_image_shape(image_inputs) + encoded_images = image_processing(image_inputs, 
return_tensors="pt").pixel_values
+ self.assertEqual(
+ tuple(encoded_images.shape),
+ (self.image_processor_tester.batch_size, *expected_output_image_shape),
+ )
diff --git a/tests/models/idefics2/test_modeling_idefics2.py b/tests/models/idefics2/test_modeling_idefics2.py
new file mode 100644
index 0000000000..5553c972e6
--- /dev/null
+++ b/tests/models/idefics2/test_modeling_idefics2.py
@@ -0,0 +1,528 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Testing suite for the PyTorch Idefics2 model."""
+
+import copy
+import gc
+import unittest
+from io import BytesIO
+
+import requests
+
+from transformers import (
+ AutoProcessor,
+ Idefics2Config,
+ Idefics2ForConditionalGeneration,
+ Idefics2Model,
+ is_torch_available,
+ is_vision_available,
+)
+from transformers.testing_utils import require_bitsandbytes, require_torch, slow, torch_device
+
+from ...generation.test_utils import GenerationTesterMixin
+from ...test_configuration_common import ConfigTester
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
+
+
+if is_torch_available():
+ import torch
+
+if is_vision_available():
+ from PIL import Image
+
+
+class Idefics2VisionText2TextModelTester:
+ def __init__(
+ self,
+ parent,
+ is_training=True,
+ batch_size=2,
+ num_images=2,
+ seq_length=10,
+ vision_config={
+ "image_size": 12,
+ "patch_size": 12,
+ "num_channels": 3,
+ "hidden_size": 32,
+ "num_hidden_layers": 2,
+ "num_attention_heads": 4,
+ "intermediate_size": 32,
+ "dropout": 0.1,
+ "attention_dropout": 0.1,
+ "initializer_range": 0.02,
+ },
+ perceiver_config={
+ "hidden_act": "silu",
+ "resampler_n_latents": 2,
+ "resampler_depth": 2,
+ "resampler_n_heads": 2,
+ "num_key_value_heads": 1,
+ "resampler_head_dim": 12,
+ "attention_dropout": 0.0,
+ },
+ text_config={
+ "vocab_size": 100,
+ "hidden_size": 64,
+ "intermediate_size": 56,
+ "num_hidden_layers": 3,
+ "num_attention_heads": 2,
+ "num_key_value_heads": 2,
+ "hidden_act": "silu",
+ "max_position_embeddings": 256,
+ "initializer_range": 0.02,
+ "rms_norm_eps": 1e-6,
+ "pad_token_id": 0, # None in the original configuration_mistral, we set it to the unk_token_id
+ "bos_token_id": 1,
+ "eos_token_id": 2,
+ "image_token_id": 32_001,
+ "tie_word_embeddings": False,
+ "rope_theta": 10000.0,
+ "sliding_window": 32,
+ "attention_dropout": 0.0,
+ },
+ use_cache=False,
+ tie_word_embeddings=False,
+ image_token_id=99,
+ ):
+ self.parent = parent
+ self.is_training = is_training
+ self.batch_size = batch_size
+ self.num_images = num_images
+ self.num_channels = 3
+ self.seq_length = seq_length
+ self.use_cache = use_cache
+ self.image_token_id = image_token_id
+ self.tie_word_embeddings = tie_word_embeddings
+ # Hack - add properties here so we can reuse the common tests
+ self.vocab_size = text_config["vocab_size"]
+ self.num_hidden_layers = text_config["num_hidden_layers"]
+ self.num_attention_heads = text_config["num_attention_heads"]
+ self.hidden_size = text_config["hidden_size"]
+
+ self.vision_config = vision_config
+ self.perceiver_config = perceiver_config
+ self.text_config = text_config
+
+ def get_config(self):
+ return Idefics2Config(
+ use_cache=self.use_cache,
+ image_token_id=self.image_token_id,
+ tie_word_embeddings=self.tie_word_embeddings,
+ vision_config=self.vision_config,
+ perceiver_config=self.perceiver_config,
+ text_config=self.text_config,
+ vocab_size=self.vocab_size,
+ )
+
+ def prepare_config_and_inputs(self):
+ pixel_values = floats_tensor(
+ [
+ self.batch_size,
+ self.num_images,
+ self.vision_config["num_channels"],
+ self.vision_config["image_size"],
+ self.vision_config["image_size"],
+ ]
+ )
+ config = self.get_config()
+
+ return config, pixel_values
+
+ def prepare_config_and_inputs_for_common(self):
+ config_and_inputs = self.prepare_config_and_inputs()
+ config, pixel_values = config_and_inputs
+ input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 2) + 1
+
+ # For simplicity just set the last n tokens to the image token
+ n_image_tokens_per_batch = self.num_images * self.perceiver_config["resampler_n_latents"]
+ input_ids[:, -n_image_tokens_per_batch:] = self.image_token_id
+ attention_mask = input_ids.ne(1).to(torch_device)
+ inputs_dict = {
+ "pixel_values": pixel_values,
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ }
+ return config, inputs_dict
+
+
+@require_torch
+class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase):
+ """
+ Model tester for `Idefics2`.
+ """
+
+ all_model_classes = (Idefics2Model,) if is_torch_available() else ()
+ fx_compatible = False
+ test_torchscript = False
+ test_pruning = False
+ test_resize_embeddings = True
+ test_head_masking = False
+
+ def setUp(self):
+ self.model_tester = Idefics2VisionText2TextModelTester(self)
+ self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False)
+
+ @unittest.skip("input_embeds cannot be passed in without input_ids")
+ def test_inputs_embeds(self):
+ pass
+
+ @unittest.skip("Model does not support padding right")
+ def test_flash_attn_2_generate_padding_right(self):
+ pass
+
+ @unittest.skip("Model does not support padding right")
+ def test_flash_attn_2_inference_padding_right(self):
+ pass
+
+ # We need to override as we need to prepare such that the image token is the last token
+ def test_resize_tokens_embeddings(self):
+ (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common()
+
+ for model_class in self.all_model_classes:
+ config = copy.deepcopy(original_config)
+ model = model_class(config)
+ model.to(torch_device)
+
+ if self.model_tester.is_training is False:
+ model.eval()
+
+ model_vocab_size = config.text_config.vocab_size
+ # Retrieve the embeddings and clone them
+ model_embed = model.resize_token_embeddings(model_vocab_size)
+ cloned_embeddings = model_embed.weight.clone()
+
+ # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size
+ model_embed = model.resize_token_embeddings(model_vocab_size + 10)
+ self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+ # Check that it actually resizes the embeddings matrix
+ self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10)
+ # Check that the model can still do a forward pass successfully (every parameter should be resized)
+ model(**self._prepare_for_class(inputs_dict, model_class))
+
+ # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size
+ model_embed = model.resize_token_embeddings(model_vocab_size - 15)
+ self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15)
+ # Check that it actually resizes the embeddings matrix
+ self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15)
+
+ # Ignore copy
+ # Check that the model can still do a forward pass successfully (every parameter should be resized)
+ # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token
+ inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2)
+ n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"]
+ model.image_token_id = model_vocab_size - 15 - 1
+ inputs_dict["input_ids"][:, -n_images:] = model.image_token_id
+
+ # make sure that decoder_input_ids are resized as well
+ if "decoder_input_ids" in inputs_dict:
+ inputs_dict["decoder_input_ids"].clamp_(max=model_vocab_size - 15 - 1)
+ model(**self._prepare_for_class(inputs_dict, model_class))
+
+ # Check that adding and removing tokens has not modified the first part of the embedding matrix.
+ models_equal = True
+ for p1, p2 in zip(cloned_embeddings, model_embed.weight):
+ if p1.data.ne(p2.data).sum() > 0:
+ models_equal = False
+
+ self.assertTrue(models_equal)
+
+ config = copy.deepcopy(original_config)
+ model = model_class(config)
+ model.to(torch_device)
+
+ model_vocab_size = config.text_config.vocab_size
+ model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1)
+ self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10)
+
+ model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64)
+ # The padded embedding matrix should have a multiple of 64 rows
+ self.assertEqual(model_embed.weight.shape[0] % 64, 0)
+
+ self.assertEqual(model_embed.weight.shape[0], model.config.text_config.vocab_size)
+ self.assertEqual(model.config.text_config.vocab_size, model.vocab_size)
+
+ model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64)
+ self.assertEqual(model_embed.weight.shape[0] % 64, 0)
+
+ # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size
+ target_dimension = 128
+ model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64)
+ self.assertEqual(model_embed.weight.shape[0], target_dimension)
+
+ with self.assertRaisesRegex(
+ ValueError,
+ "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. 
Please make sure to pass an integer", + ): + model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_embeddings_untied(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + original_config.tie_word_embeddings = False + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # if no output embeddings -> leave test + if model.get_output_embeddings() is None: + continue + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"] + model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.image_token_id + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + +@require_torch +class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase): + """ + Model tester for `Idefics2ForConditionalGeneration`. 
+ """ + + all_model_classes = (Idefics2ForConditionalGeneration,) if is_torch_available() else () + fx_compatible = False + test_pruning = False + test_resize_embeddings = True + test_head_masking = False + test_torchscript = False + + def setUp(self): + self.model_tester = Idefics2VisionText2TextModelTester(self) + self.config_tester = ConfigTester(self, config_class=Idefics2Config, has_text_modality=False) + + @unittest.skip("input_embeds cannot be passed in without input_ids") + def test_inputs_embeds(): + pass + + @unittest.skip("Model does not support padding right") + def test_flash_attn_2_generate_padding_right(self): + pass + + @unittest.skip("Model does not support padding right") + def test_flash_attn_2_inference_padding_right(self): + pass + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_tokens_embeddings(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + # Retrieve the embeddings and clone theme + model_embed = model.resize_token_embeddings(model_vocab_size) + cloned_embeddings = model_embed.weight.clone() + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model_embed = model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + self.assertEqual(model_embed.weight.shape[0], cloned_embeddings.shape[0] - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"] + model.model.image_token_id = model_vocab_size - 15 - 1 + inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id + + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that adding and removing tokens has not modified the first part of the embedding matrix. 
+ models_equal = True + for p1, p2 in zip(cloned_embeddings, model_embed.weight): + if p1.data.ne(p2.data).sum() > 0: + models_equal = False + + self.assertTrue(models_equal) + + config = copy.deepcopy(original_config) + model = model_class(config) + model.to(torch_device) + + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10, pad_to_multiple_of=1) + self.assertTrue(model.config.text_config.vocab_size + 10, model_vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + self.assertTrue(model_embed.weight.shape[0], model.config.text_config.vocab_size) + self.assertTrue(model.config.text_config.vocab_size, model.vocab_size) + + model_embed = model.resize_token_embeddings(model_vocab_size + 13, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0] // 64, 0) + + # Check that resizing a model to a multiple of pad_to_multiple leads to a model of exactly that size + target_dimension = 128 + model_embed = model.resize_token_embeddings(target_dimension, pad_to_multiple_of=64) + self.assertTrue(model_embed.weight.shape[0], target_dimension) + + with self.assertRaisesRegex( + ValueError, + "Asking to pad the embedding matrix to a multiple of `1.3`, which is not and integer. Please make sure to pass an integer", + ): + model.resize_token_embeddings(model_vocab_size, pad_to_multiple_of=1.3) + + # We need to override as we need to prepare such that the image token is the last token + def test_resize_embeddings_untied(self): + (original_config, inputs_dict) = self.model_tester.prepare_config_and_inputs_for_common() + + original_config.tie_word_embeddings = False + + for model_class in self.all_model_classes: + config = copy.deepcopy(original_config) + model = model_class(config).to(torch_device) + + # Check that resizing the token embeddings with a larger vocab size increases the model's vocab size + model_vocab_size = config.text_config.vocab_size + model.resize_token_embeddings(model_vocab_size + 10) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size + 10) + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size + 10) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size + 10) + # Check that the model can still do a forward pass successfully (every parameter should be resized) + model(**self._prepare_for_class(inputs_dict, model_class)) + + # Check that resizing the token embeddings with a smaller vocab size decreases the model's vocab size + model.resize_token_embeddings(model_vocab_size - 15) + self.assertEqual(model.config.text_config.vocab_size, model_vocab_size - 15) + # Check that it actually resizes the embeddings matrix + output_embeds = model.get_output_embeddings() + self.assertEqual(output_embeds.weight.shape[0], model_vocab_size - 15) + # Check bias if present + if output_embeds.bias is not None: + self.assertEqual(output_embeds.bias.shape[0], model_vocab_size - 15) + + # Check that the model can still do a forward pass successfully (every parameter should be resized) + # Input ids should be clamped to the maximum size of the vocabulary - 1 and the image token should be the last token + inputs_dict["input_ids"].clamp_(max=model_vocab_size - 15 - 2) + n_images = self.model_tester.num_images * self.model_tester.perceiver_config["resampler_n_latents"] + 
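+ # Re-point the model's image-token id at the new last id so the image tokens written below stay inside the shrunken vocab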
+ model.model.image_token_id = model_vocab_size - 15 - 1
+ inputs_dict["input_ids"][:, -n_images:] = model.model.image_token_id
+
+ # Check that the model can still do a forward pass successfully (every parameter should be resized)
+ model(**self._prepare_for_class(inputs_dict, model_class))
+
+
+@require_torch
+class Idefics2ForConditionalGenerationIntegrationTest(unittest.TestCase):
+ def setUp(self):
+ self.processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b-base")
+ self.image1 = Image.open(
+ BytesIO(
+ requests.get(
+ "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+ ).content
+ )
+ )
+ self.image2 = Image.open(
+ BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
+ )
+ self.image3 = Image.open(
+ BytesIO(
+ requests.get(
+ "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
+ ).content
+ )
+ )
+
+ def tearDown(self):
+ gc.collect()
+ torch.cuda.empty_cache()
+
+ @slow
+ def test_integration_test(self):
+ model = Idefics2ForConditionalGeneration.from_pretrained(
+ "HuggingFaceM4/idefics2-8b-base",
+ torch_dtype=torch.bfloat16,
+ device_map="auto",
+ )
+ model.to(torch_device)
+
+ # Create inputs
+ text = "In this image, we see"
+ images = self.image1
+ inputs = self.processor(text=text, images=images, return_tensors="pt", padding=True)
+ inputs.to(torch_device)
+
+ generated_ids = model.generate(**inputs, max_new_tokens=10)
+ generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+ # Batch affects generated text. Single batch output: ['In this image, we see the Statue of Liberty in the foreground and']
+ expected_generated_text = "In this image, we see the Statue of Liberty, the New York City"
+ self.assertEqual(generated_texts[0], expected_generated_text)
+
+ @slow
+ @require_bitsandbytes
+ def test_integration_test_4bit(self):
+ # Let's make sure we test the preprocessing to replace what is used
+ model = Idefics2ForConditionalGeneration.from_pretrained(
+ "HuggingFaceM4/idefics2-8b-base", load_in_4bit=True, device_map="auto"
+ )
+
+ # Create pixel inputs
+ text = ["In this image, we see", "bla, bla "]
+ images = [[self.image1], [self.image2, self.image3]]
+ inputs = self.processor(text=text, images=images, padding=True, return_tensors="pt")
+
+ generated_ids = model.generate(**inputs, max_new_tokens=10)
+ generated_texts = self.processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+ expected_generated_text = "In this image, we see the Statue of Liberty, the Hudson River,"
+ self.assertEqual(generated_texts[0], expected_generated_text)
diff --git a/tests/models/idefics2/test_processing_idefics2.py b/tests/models/idefics2/test_processing_idefics2.py
new file mode 100644
index 0000000000..2fd569f991
--- /dev/null
+++ b/tests/models/idefics2/test_processing_idefics2.py
@@ -0,0 +1,235 @@
+# coding=utf-8
+# Copyright 2024 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+from io import BytesIO
+
+import requests
+
+from transformers import Idefics2Processor
+from transformers.testing_utils import require_torch, require_vision
+from transformers.utils import is_vision_available
+
+
+if is_vision_available():
+ from PIL import Image
+
+
+@require_torch
+@require_vision
+class Idefics2ProcessorTest(unittest.TestCase):
+ def setUp(self):
+ self.processor = Idefics2Processor.from_pretrained("HuggingFaceM4/idefics2-8b", image_seq_len=2)
+ self.image1 = Image.open(
+ BytesIO(
+ requests.get(
+ "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
+ ).content
+ )
+ )
+ self.image2 = Image.open(
+ BytesIO(requests.get("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg").content)
+ )
+ self.image3 = Image.open(
+ BytesIO(
+ requests.get(
+ "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"
+ ).content
+ )
+ )
+ self.bos_token = self.processor.tokenizer.bos_token
+ self.image_token = self.processor.image_token.content
+ self.fake_image_token = self.processor.fake_image_token.content
+
+ self.bos_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.bos_token)
+ self.image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.image_token)
+ self.fake_image_token_id = self.processor.tokenizer.convert_tokens_to_ids(self.fake_image_token)
+ self.image_seq_len = self.processor.image_seq_len
+
+ def test_process_interleaved_images_prompts_no_image_splitting(self):
+ old_image_splitting = self.processor.image_processor.do_image_splitting
+
+ self.processor.image_processor.do_image_splitting = False
+
+ # Test that a single image is processed correctly
+ inputs = self.processor(images=self.image1)
+ self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
+ self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
+
+ # Test a single sample with image and text
+ image_str = "<image>"
+ text_str = "In this image, we see"
+ text = image_str + text_str
+ inputs = self.processor(text=text, images=self.image1)
+
+ # fmt: off
+ tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+ expected_input_ids = [[self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
+ self.assertEqual(inputs["input_ids"], expected_input_ids)
+ self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+ self.assertEqual(inputs["pixel_values"].shape, (1, 1, 3, 653, 980))
+ self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 1, 653, 980))
+ # fmt: on
+
+ # Test that batch is correctly processed
+ image_str = "<image>"
+ text_str_1 = "In this image, we see"
+ text_str_2 = "bla, bla"
+
+ text = [
+ image_str + text_str_1,
+ text_str_2 + image_str + image_str,
+ ]
+ images = [[self.image1], [self.image2, self.image3]]
+
+ inputs = self.processor(text=text, images=images, padding=True)
+
+ # fmt: off
+ tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
+ tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+ expected_input_ids_1 = [self.bos_token_id] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
+ expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len + [self.fake_image_token_id]
+ # Pad the first input to match the second input
+ pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
+ padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
+
+ self.assertEqual(
+ inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
+ )
+ self.assertEqual(
+ inputs["attention_mask"],
+ [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
+ )
+ self.assertEqual(inputs['pixel_values'].shape, (2, 2, 3, 767, 980))
+ self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 2, 767, 980))
+ # fmt: on
+
+ self.processor.image_processor.do_image_splitting = old_image_splitting
+
+ def test_process_interleaved_images_prompts_image_splitting(self):
+ old_image_splitting = self.processor.image_processor.do_image_splitting
+
+ self.processor.image_processor.do_image_splitting = True
+
+ # Test that a single image is processed correctly
+ inputs = self.processor(images=self.image1)
+ self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
+ self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
+
+ # Test a single sample with image and text
+ image_str = "<image>"
+ text_str = "In this image, we see"
+ text = image_str + text_str
+ inputs = self.processor(text=text, images=self.image1)
+
+ # fmt: off
+ tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+ expected_input_ids = [[self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence["input_ids"]]
+ self.assertEqual(inputs["input_ids"], expected_input_ids)
+ self.assertEqual(inputs["attention_mask"], [[1] * len(expected_input_ids[0])])
+ self.assertEqual(inputs["pixel_values"].shape, (1, 5, 3, 653, 980))
+ self.assertEqual(inputs["pixel_attention_mask"].shape, (1, 5, 653, 980))
+ # fmt: on
+
+ # Test that batch is correctly processed
+ image_str = "<image>"
+ text_str_1 = "In this image, we see"
+ text_str_2 = "bla, bla"
+
+ text = [
+ image_str + text_str_1,
+ text_str_2 + image_str + image_str,
+ ]
+ images = [[self.image1], [self.image2, self.image3]]
+
+ inputs = self.processor(text=text, images=images, padding=True)
+
+ # fmt: off
+ tokenized_sentence_1 = self.processor.tokenizer(text_str_1, add_special_tokens=False)
+ tokenized_sentence_2 = self.processor.tokenizer(text_str_2, add_special_tokens=False)
+ expected_input_ids_1 = [self.bos_token_id] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id] + tokenized_sentence_1["input_ids"]
+ expected_input_ids_2 = [self.bos_token_id] + tokenized_sentence_2["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * 5 + [self.fake_image_token_id]
+ # Pad the first input to match the second input
+ pad_len = len(expected_input_ids_2) - len(expected_input_ids_1)
+ padded_expected_input_ids_1 = [0] * pad_len + expected_input_ids_1
+
+ self.assertEqual(
+ inputs["input_ids"], [padded_expected_input_ids_1, expected_input_ids_2]
+ )
+ self.assertEqual(
+ inputs["attention_mask"],
+ [[0] * pad_len + [1] * len(expected_input_ids_1), [1] * len(expected_input_ids_2)]
+ )
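+ # With image splitting on, each image contributes 5 slices (the 4 crops plus the original), so 2 images per sample gives 10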
+ self.assertEqual(inputs['pixel_values'].shape, (2, 10, 3, 767, 980))
+ self.assertEqual(inputs['pixel_attention_mask'].shape, (2, 10, 767, 980))
+ # fmt: on
+
+ self.processor.image_processor.do_image_splitting = old_image_splitting
+
+ def test_add_special_tokens_processor(self):
+ image_str = "<image>"
+ text_str = "In this image, we see"
+ text = text_str + image_str
+
+ n_image_repeat = 5 if self.processor.image_processor.do_image_splitting else 1
+
+ # fmt: off
+ inputs = self.processor(text=text, images=self.image1, add_special_tokens=False)
+ tokenized_sentence = self.processor.tokenizer(text_str, add_special_tokens=False)
+ expected_input_ids = [tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
+ self.assertEqual(inputs["input_ids"], expected_input_ids)
+
+ inputs = self.processor(text=text, images=self.image1)
+ expected_input_ids = [[self.bos_token_id] + tokenized_sentence["input_ids"] + ([self.fake_image_token_id] + [self.image_token_id] * self.image_seq_len) * n_image_repeat + [self.fake_image_token_id]]
+ self.assertEqual(inputs["input_ids"], expected_input_ids)
+ # fmt: on
+
+ def test_apply_chat_template(self):
+ # Message content is a mix of typed entries (text dicts, image placeholders) and plain strings
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "What do these images show?"},
+ {"type": "image"},
+ {"type": "image"},
+ "What do these images show?",
+ ],
+ },
+ {
+ "role": "assistant",
+ "content": [
+ {
+ "type": "text",
+ "text": "The first image shows the Statue of Liberty in New York. The second image depicts Idefix, the dog of Obelix in Asterix and Obelix.",
+ }
+ ],
+ },
+ {"role": "user", "content": [{"type": "text", "text": "And who is that?"}]},
+ ]
+
+ processor = self.processor
+ # Make short sequence length to test that the fake tokens are added correctly
+ rendered = processor.apply_chat_template(messages, add_generation_prompt=True)
+
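+ # The chat template renders each message as a "User:" / "Assistant:" turn and, with add_generation_prompt=True, ends on a bare "Assistant:"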
+ expected_rendered = (
+ "User: What do these images show?\n"
+ "Assistant: The first image shows the Statue of Liberty in New York. The second image depicts Idefix, the dog of Obelix in Asterix and Obelix.\n"
+ "User: And who is that?\n"
+ "Assistant:"
+ )
+ self.assertEqual(rendered, expected_rendered)
diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py
index e92aca1cd7..a396ac752d 100755
--- a/tests/test_modeling_common.py
+++ b/tests/test_modeling_common.py
@@ -60,6 +60,7 @@ from transformers.models.auto.modeling_auto import (
 MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES,
 MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
 MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES,
+ MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES,
 MODEL_MAPPING_NAMES,
 )
 from transformers.testing_utils import (
@@ -220,6 +221,7 @@ class ModelTesterMixin:
 *get_values(MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES),
 *get_values(MODEL_FOR_MASKED_LM_MAPPING_NAMES),
 *get_values(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES),
+ *get_values(MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES),
 ]:
 inputs_dict["labels"] = torch.zeros(
 (self.model_tester.batch_size, self.model_tester.seq_length), dtype=torch.long, device=torch_device
diff --git a/utils/slow_documentation_tests.txt b/utils/slow_documentation_tests.txt
index 65e05ed893..dc5a6b5c30 100644
--- a/utils/slow_documentation_tests.txt
+++ b/utils/slow_documentation_tests.txt
@@ -9,6 +9,7 @@ docs/source/en/tasks/prompting.md
 src/transformers/models/blip_2/modeling_blip_2.py
 src/transformers/models/ctrl/modeling_ctrl.py
 src/transformers/models/fuyu/modeling_fuyu.py
+src/transformers/models/idefics2/modeling_idefics2.py
 src/transformers/models/kosmos2/modeling_kosmos2.py
 src/transformers/models/musicgen_melody/modeling_musicgen_melody.py
-src/transformers/models/musicgen_melody/processing_musicgen_melody.py
\ No newline at end of file
+src/transformers/models/musicgen_melody/processing_musicgen_melody.py

From 8127f39624f587bdb04d55ab655df1753de7720a Mon Sep 17 00:00:00 2001
From: Sayak Paul
Date: Mon, 15 Apr 2024 22:00:59 +0530
Subject: [PATCH 110/110] [Docs] Update recurrent_gemma.md for some minor nits
 (#30238)

Update recurrent_gemma.md
---
 docs/source/en/model_doc/recurrent_gemma.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/en/model_doc/recurrent_gemma.md b/docs/source/en/model_doc/recurrent_gemma.md
index 35a8ce9e3a..ceee799159 100644
--- a/docs/source/en/model_doc/recurrent_gemma.md
+++ b/docs/source/en/model_doc/recurrent_gemma.md
@@ -26,9 +26,9 @@ The abstract from the paper is the following:
 Tips:
 
-- The original checkpoints can be converted using the conversion script `src/transformers/models/recurrent_gemma/convert_recurrent_gemma_weights_to_hf.py`
+- The original checkpoints can be converted using the conversion script [`src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/recurrent_gemma/convert_recurrent_gemma_to_hf.py).
 
-This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
+This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ). The original code can be found [here](https://github.com/google-deepmind/recurrentgemma).
 
 ## RecurrentGemmaConfig
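For readers following the docs change above, a minimal load-and-generate sketch for a converted RecurrentGemma checkpoint is shown below. This is illustrative only: the `google/recurrentgemma-2b` model id is an assumption, not something this patch specifies.

```python
# Illustrative sketch only. Assumes the "google/recurrentgemma-2b" model id,
# which this patch does not pin down; any checkpoint produced by the
# conversion script referenced above should load the same way.
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/recurrentgemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/recurrentgemma-2b")

inputs = tokenizer("Recurrent models trade attention for", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=20)  # greedy decoding by default
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```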