updates

2024-05-29 13:05:07 +02:00 · 2024-05-29 13:05:07 +02:00 · e3e6ccac62
parent 058b6fa71d
commit e3e6ccac62
9 changed files with 138 additions and 107 deletions
--- a/examples/diff_conversion/diff_gemma3.py
+++ b/examples/diff_conversion/diff_gemma3.py
@ -0,0 +1,40 @@
+from transformers.models.llama.modeling_llama import LlamaModel
+from typing import *
+import torch
+from math import log
+from transformers.modeling_outputs import CausalLMOutputWithPast
+from transformers import Cache
+
+def _pre_process_input(input_ids):
+    # Prints the logarithm of `input_ids` and returns `input_ids` unchanged.
+    # NOTE(review): `log` comes from `from math import log` at the top of this file, while the
+    # caller annotates `input_ids` as a `torch.LongTensor` defaulting to `None` -- confirm that
+    # `math.log` accepts whatever is actually passed here.
+    print(log(input_ids))
+    return input_ids
+
+# Example where we need some dependencies and some helper functions
+class GemmaModel(LlamaModel):
+    # Only `forward` is overridden in this class; nothing else is defined here.
+    def forward(
+        self,
+        input_ids: torch.LongTensor = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[Tuple, CausalLMOutputWithPast]:
+        # The helper's return value is discarded; it is called only for its print side effect.
+        _pre_process_input(input_ids)
+
+        # NOTE(review): `None` is passed in the first positional slot, so `input_ids` is not
+        # forwarded to the parent `forward` -- confirm this is intentional for the example.
+        # All remaining arguments are forwarded positionally, in declaration order.
+        return super().forward(
+            None,
+            attention_mask,
+            position_ids,
+            past_key_values,
+            inputs_embeds,
+            use_cache,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            cache_position,
+        )
--- a/examples/diff_conversion/diff_llama1.py
+++ b/examples/diff_conversion/diff_llama1.py
@ -0,0 +1,27 @@
+from transformers.models.llama.modeling_llama import LlamaConfig
+# Example where we only want to overwrite the default values of an `__init__`
+class GemmaConfig(LlamaConfig):
+    # Declares a constructor whose only visible difference from a plain subclass is the set of
+    # default values listed in the signature below.
+    def __init__(
+        self,
+        vocab_size=256000,
+        hidden_size=3072,
+        intermediate_size=24576,
+        num_hidden_layers=28,
+        num_attention_heads=16,
+        num_key_value_heads=16,
+        head_dim=256,
+        hidden_act="gelu_pytorch_tanh",
+        hidden_activation=None,
+        max_position_embeddings=8192,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        eos_token_id=1,
+        bos_token_id=2,
+        tie_word_embeddings=True,
+        rope_theta=10000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+    ):
+        # NOTE(review): none of the arguments above are forwarded; the only argument passed to
+        # the parent `__init__` is `self` itself. Presumably this call is a placeholder that the
+        # diff converter expands -- confirm before importing this file directly.
+        super().__init__(self)
--- a/examples/diff_conversion/diff_llama2.py
+++ b/examples/diff_conversion/diff_llama2.py
@ -0,0 +1,13 @@
+from transformers.models.llama.modeling_llama import LlamaConfig
+# Example where we only want to add a new config argument and its argument doc
+# Here there is no `ARG`, so we are going to take the parent doc
+class GemmaConfig(LlamaConfig):
+    r"""
+        mlp_bias (`bool`, *optional*, defaults to `False`)
+    """
+    # The constructor below declares a single new argument, `mlp_bias`.
+    def __init__(
+        self,
+        mlp_bias=False 
+    ):
+        # Store the new argument on the instance before delegating to the parent constructor.
+        self.mlp_bias = mlp_bias
+        # NOTE(review): `self` is passed as the only positional argument to the parent
+        # `__init__`. Presumably a placeholder expanded by the diff converter -- confirm.
+        super().__init__(self)
--- a/examples/diff_conversion/diff_llama3.py
+++ b/examples/diff_conversion/diff_llama3.py
@ -0,0 +1,28 @@
+from transformers.models.llama.modeling_llama import LlamaForSequenceClassification
+from transformers.models.llama.configuration_llama import LlamaConfig
+# Example where we only want to modify the docstring
+class GemmaConfig(LlamaConfig):
+    # This class body consists solely of the docstring below; no attribute or method is defined.
+    r"""
+    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Gemma-7B.
+    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256000):
+            Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`GemmaModel`]
+    ```python
+    >>> from transformers import GemmaModel, GemmaConfig
+    >>> # Initializing a Gemma gemma-7b style configuration
+    >>> configuration = GemmaConfig()
+    >>> # Initializing a model from the gemma-7b style configuration
+    >>> model = GemmaModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+# Example where all the dependencies are fetched just to copy the entire class
+class GemmaForSequenceClassification(LlamaForSequenceClassification):
+    # Intentionally empty: the subclass adds nothing of its own to the parent class.
+    pass
--- a/src/transformers/models/gemma/configuration_gemma.py
+++ b/src/transformers/models/gemma/configuration_gemma.py
@ -1,7 +1,7 @@
 #           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 #               This file was automatically generated from <path_to_diff_file.py>.
 #         Do NOT edit this file manually as any edits will be overwritten by the generation of
-#         the file from the diff. If any change should be done, please apply the change to the
+#         the file from the diff. If any change should be done, please apply the change to the 
 #                                    diff.py file directly.
 #           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # coding=utf-8
@ -21,6 +21,7 @@
 # limitations under the License.


+
 from transformers import PretrainedConfig


@ -29,24 +30,22 @@ class GemmaConfig(PretrainedConfig):
    This is the configuration class to store the configuration of a [`GemmaModel`]. It is used to instantiate an Gemma
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a similar configuration to that of the Gemma-7B.
-
+    e.g. [google/gemma-7b](https://huggingface.co/google/gemma-7b)
    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.
-
-
    Args:
-        vocab_size (`int`, *optional*, defaults to 32000):
+        vocab_size (`int`, *optional*, defaults to 256000):
            Vocabulary size of the Gemma model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`GemmaModel`]
-        hidden_size (`int`, *optional*, defaults to 4096):
+        hidden_size (`int`, *optional*, defaults to 3072):
            Dimension of the hidden representations.
-        intermediate_size (`int`, *optional*, defaults to 11008):
+        intermediate_size (`int`, *optional*, defaults to 24576):
            Dimension of the MLP representations.
-        num_hidden_layers (`int`, *optional*, defaults to 32):
+        num_hidden_layers (`int`, *optional*, defaults to 28):
            Number of hidden layers in the Transformer decoder.
-        num_attention_heads (`int`, *optional*, defaults to 32):
+        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
-        num_key_value_heads (`int`, *optional*):
+        num_key_value_heads (`int`, *optional*, defaults to 16):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
@ -54,11 +53,15 @@ class GemmaConfig(PretrainedConfig):
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
-        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-            The non-linear activation function (function or string) in the decoder.
-        max_position_embeddings (`int`, *optional*, defaults to 2048):
-            The maximum sequence length that this model might ever be used with. Gemma 1 supports up to 2048 tokens,
-            Gemma 2 up to 4096, CodeGemma up to 16384.
+        head_dim (`int`, *optional*, defaults to 256):
+            The attention head dimension.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The legacy activation function. It is overwritten by the `hidden_activation`.
+        hidden_activation (`str` or `function`, *optional*):
+            The non-linear activation function (function or string) in the decoder. Will default to `"gelu_pytorch_tanh"`
+            if not specified. `"gelu_pytorch_tanh"` uses an approximation of the `"gelu"` activation function.
+        max_position_embeddings (`int`, *optional*, defaults to 8192):
+            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
@ -66,45 +69,26 @@ class GemmaConfig(PretrainedConfig):
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
-        pad_token_id (`int`, *optional*):
+        pad_token_id (`int`, *optional*, defaults to 0):
            Padding token id.
-        bos_token_id (`int`, *optional*, defaults to 1):
-            Beginning of stream token id.
-        eos_token_id (`int`, *optional*, defaults to 2):
+        eos_token_id (`int`, *optional*, defaults to 1):
            End of stream token id.
-        pretraining_tp (`int`, *optional*, defaults to 1):
-            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
-            document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to understand more about it. This value is
-            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
-            issue](https://github.com/pytorch/pytorch/issues/76232).
-        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+        bos_token_id (`int`, *optional*, defaults to 2):
+            Beginning of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
            Whether to tie weight embeddings
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
-        rope_scaling (`Dict`, *optional*):
-            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
-            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
-            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
-            these scaling strategies behave:
-            https://www.reddit.com/r/LocalGemma/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
-            experimental feature, subject to breaking API changes in future versions.
-        attention_bias (`bool`, *optional*, defaults to `False`):
+        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
-        mlp_bias (`bool`, *optional*, defaults to `False`):
-            Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers.
-
    ```python
    >>> from transformers import GemmaModel, GemmaConfig
-
    >>> # Initializing a Gemma gemma-7b style configuration
    >>> configuration = GemmaConfig()
-
    >>> # Initializing a model from the gemma-7b style configuration
    >>> model = GemmaModel(configuration)
-
    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""
@ -134,6 +118,7 @@ class GemmaConfig(PretrainedConfig):
        rope_theta=10000.0,
        attention_bias=False,
        attention_dropout=0.0,
+        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
@ -141,23 +126,16 @@ class GemmaConfig(PretrainedConfig):
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
-
-        # for backward compatibility
-        if num_key_value_heads is None:
-            num_key_value_heads = num_attention_heads
-
+        self.head_dim = head_dim
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
+        self.hidden_activation = hidden_activation
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
-        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
-        self.rope_scaling = rope_scaling
-        self._rope_scaling_validation()
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout
-        self.mlp_bias = mlp_bias

        super().__init__(
            pad_token_id=pad_token_id,
@ -166,23 +144,3 @@ class GemmaConfig(PretrainedConfig):
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )
-
-    def _rope_scaling_validation(self):
-        """
-        Validate the `rope_scaling` configuration.
-        """
-        if self.rope_scaling is None:
-            return
-
-        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
-            raise ValueError(
-                "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, " f"got {self.rope_scaling}"
-            )
-        rope_scaling_type = self.rope_scaling.get("type", None)
-        rope_scaling_factor = self.rope_scaling.get("factor", None)
-        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
-            raise ValueError(
-                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
-            )
-        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
-            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
--- a/src/transformers/models/gemma/diff_gemma.py
+++ b/src/transformers/models/gemma/diff_gemma.py
@ -23,7 +23,6 @@ import torch.utils.checkpoint
 from torch import nn

 from transformers import PretrainedConfig
-from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import (
    LlamaForCausalLM,
    LlamaForSequenceClassification,
@ -163,35 +162,6 @@ class GemmaConfig(PretrainedConfig):
            **kwargs,
        )

-
-# Example where we only want to overwrite the defaults of an init?
-class GemmaConfig(LlamaConfig):
-    def __init__(
-        self,
-        vocab_size=256000,
-        hidden_size=3072,
-        intermediate_size=24576,
-        num_hidden_layers=28,
-        num_attention_heads=16,
-        num_key_value_heads=16,
-        head_dim=256,
-        hidden_act="gelu_pytorch_tanh",
-        hidden_activation=None,
-        max_position_embeddings=8192,
-        initializer_range=0.02,
-        rms_norm_eps=1e-6,
-        use_cache=True,
-        pad_token_id=0,
-        eos_token_id=1,
-        bos_token_id=2,
-        tie_word_embeddings=True,
-        rope_theta=10000.0,
-        attention_bias=False,
-        attention_dropout=0.0,
-    ):
-        super().__init__(self)
-
-
 class GemmaRMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
--- a/src/transformers/models/gemma/modeling_gemma.py
+++ b/src/transformers/models/gemma/modeling_gemma.py
@ -1,7 +1,7 @@
 #           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 #               This file was automatically generated from <path_to_diff_file.py>.
 #         Do NOT edit this file manually as any edits will be overwritten by the generation of
-#         the file from the diff. If any change should be done, please apply the change to the
+#         the file from the diff. If any change should be done, please apply the change to the 
 #                                    diff.py file directly.
 #           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # coding=utf-8
@ -86,7 +86,6 @@ def _get_unpad_data(attention_mask):
        max_seqlen_in_batch,
    )

-
 class GemmaRMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
--- a/src/transformers/models/llama/modeling_llama.py
+++ b/src/transformers/models/llama/modeling_llama.py
@ -1,7 +1,7 @@
 #           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 #               This file was automatically generated from <path_to_diff_file.py>.
 #         Do NOT edit this file manually as any edits will be overwritten by the generation of
-#         the file from the diff. If any change should be done, please apply the change to the
+#         the file from the diff. If any change should be done, please apply the change to the 
 #                                    diff.py file directly.
 #           🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
 # coding=utf-8
--- a/utils/diff_model_converter.py
+++ b/utils/diff_model_converter.py
@ -189,6 +189,7 @@ DOCSTRING_NODE = m.SimpleStatementLine(
    body=[
        m.Expr(
            value=m.SimpleString(
+                # match anything between """ """
                value=m.MatchIfTrue(lambda value: re.search(r"\"\"\"[\s\S]*\"\"\"", value) is not None)
            )
        )
@ -290,12 +291,7 @@ def replace_call_to_super(class_finder: ClassFinder, updated_node: cst.ClassDef,
                                                                            |     ```
    """
    original_node = class_finder.classes[class_name]
-
-    # TODO here is where we merge stuff from super. We can choose to merge the docstring as well!
-    # We could also check the docstring here
    original_methods = {f.name.value if hasattr(f, "name") else f: f for f in original_node.body.body}
-
-    # Copy methods from original node to replacement node, preserving decorators
    updated_methods = {f.name.value if hasattr(f, "name") else f: f for f in updated_node.body.body}
    end_meth = []
    for name, func in original_methods.items():