transformers.js/tests/generate_tests.py

# Helper file to dynamically generate unit tests
# This is done by running the python Transformers library and comparing its outputs with ours.

import json
import os
from itertools import product

from transformers import AutoTokenizer, AutoConfig
import numpy as np

from scripts.supported_models import SUPPORTED_MODELS

# List of tokenizers where the model isn't yet supported, but the tokenizer is
ADDITIONAL_TOKENIZERS_TO_TEST = {
    'falcon': [
        'tiiuae/falcon-7b',
    ],
    "llama": [
        'hf-internal-testing/llama-tokenizer',  # Special tokens: normalized=true
        'Xenova/llama2-tokenizer',  # Special tokens: normalized=false
        'Xenova/llama2-chat-tokenizer',  # Special tokens: normalized=false
        'hf-internal-testing/llama-code-tokenizer',
    ],
    'mpt': [
        'mosaicml/mpt-7b',
    ],
    't5': [
        # TODO: Add back when https://github.com/huggingface/transformers/issues/26318 is fixed
        # 'Xenova/t5-tokenizer-new',
    ],
}

MODELS_TO_IGNORE = [
    # TODO: remove when https://github.com/huggingface/tokenizers/issues/251 is fixed
    'xlm',

    # TODO: remove when https://github.com/huggingface/transformers/issues/26018 is fixed
    'marian',

    # TODO: remove when https://github.com/huggingface/transformers/issues/26547 is fixed
    'speecht5',
]

TOKENIZERS_TO_IGNORE = [
    # TODO: remove when https://github.com/huggingface/transformers/pull/25478 is merged
    'facebook/m2m100_418M',

    # TODO: remove when https://github.com/huggingface/transformers/issues/28096 is addressed
    'RajuKandasamy/tamillama_tiny_30m',
]

MAX_TESTS = {
    'marian': 10,
}

TOKENIZER_TEST_DATA = {
    "shared": [
        "hello world",
        "Hello World",
        "How are you doing?",
        "You should've done this",
        "A\n'll !!to?'d''d of, can't.",
        "def main():\n\tpass",
        "This\n\nis\na\ntest.",
        "let a = obj.toString();\ntoString();",
        'Hi  Hello',
        "trailing space   ",
        "   leading space",
        "生活的真谛是",
        "The company was founded in 2016.",
        "test $1 R2 #3 €4 £5 ¥6 ₣7 ₹8 ₱9 test",
        "I bought an apple for $1.00 at the store.",
        "you…  ",
        "\u0079\u006F\u0075\u2026\u00A0\u00A0",
        "\u0079\u006F\u0075\u2026\u00A0\u00A0\u0079\u006F\u0075\u2026\u00A0\u00A0",
        "▁This ▁is ▁a ▁test ▁.",
        "weird \uFF5E edge \uFF5E case",

        # SentencePiece-specific test cases
        "<s>\n",
        " </s> test </s> ",
        "</s>test</s>",
    ],
    "custom_by_model_type": {
        "llama": [
            # Additional test-cases for the Llama tokenizer, adapted from
            # https://github.com/belladoreai/llama-tokenizer-js/blob/master/llama-tokenizer.js#L381-L452
            "grabbed",
            " grabbed",
            "           grabbed",
            "\n",
            " \n",
            "	tabs				out here",
            "\n\t\n",
            "ax\n####\nboo",
            "镇",
            "🦙",
            "🦙Ꙋ",
            "Ꙋ🦙",
            "The llama (/ˈlɑːmə/; 🦙Spanish pronunciation: [ˈʎama]) (Lama glama) is a domesticated South American " \
            "camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas " \
            "are social animals and live with others as a herd. Their wool is soft and contains only a small " \
            "amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they " \
            "can carry about 25 to 30% of their body weight for 8 to 13 km (5–8 miles).[3] The name llama (in the " \
            "past also spelled \"lama\" or \"glama\") was adopted by European settlers from native Peruvians.[4] " \
            "The ancestors of llamas are thought to have originated from the Great Plains of North America about " \
            "40 million years ago, and subsequently migrated to South America about three million years ago during " \
            "the Great American Interchange. By the end of the last ice age (10,000–12,000 years ago), camelids were " \
            "extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South " \
            "America and over 158,000 llamas and 100,000Ꙋ🦙 alpacas, descended from progenitors imported late in " \
            "the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. " \
            "The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to " \
            "Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the " \
            "end of time.[6]",
        ]
    },
    "custom": {
        "facebook/blenderbot_small-90M": [
            # Test special tokens
            "__start__hello world__end__",
            # The original (python) tokenizer simply joins by spaces (regardless of special tokens or not)
            "__start__ hey __end__"  # --> ... --> "__start__ hey __end__"
            "__start__hey __end__"  # --> ... --> "__start__ hey __end__"
        ],
        "tiiuae/falcon-7b": [
            "12 and 123 and 1234",  # Special case for splitting on 3 numbers
        ],
        "InstaDeepAI/nucleotide-transformer-500m-human-ref": [
            # Actual protein sequences
            "ATTCCGATTCCGATTCCG",
            "ATTTCTCTCTCTCTCTGAGATCGATCGATCGAT",

            # Special tokens
            "<unk><pad><mask><cls><eos><bos>",
        ],

        "distil-whisper/distil-small.en": [
            "   <|startoftranscript|> <|en|>   ",  # Tests lstrip+rstrip
        ],

        "Xenova/t5-tokenizer-new": [
            # Tests the new T5 tokenizer, which uses a different prepend_scheme for its pre_tokenizer:
            # tokenizer._tokenizer.pre_tokenizer = Metaspace(add_prefix_space = True, replacement = "▁", prepend_scheme = "first")
            # See https://github.com/huggingface/transformers/pull/26678 for more information.
            #  - Old (incorrect): ['▁Hey', '▁', '</s>', '▁', '.', '▁how', '▁are', '▁you']
            #  - New (correct):   ['▁Hey', '▁', '</s>', '.', '▁how', '▁are', '▁you']
            "Hey </s>. how are you",
        ],
    },
}

CHAT_MESSAGES_EXAMPLES = {
    'basic': [
        {"role": "user", "content": "Hello, how are you?"},
        {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
        {"role": "user", "content": "I'd like to show off how chat templating works!"},
    ],

    'system': [
        {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate"},
        {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
    ],

    'system + assistant': [
        {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate"},
        {"role": "user", "content": "Hello, how are you?"},
        {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
        {"role": "user", "content": "I'd like to show off how chat templating works!"},
    ],
}

TOKENIZERS_WITH_CHAT_TEMPLATES = {
    # https://huggingface.co/docs/transformers/main/en/chat_templating
    'Xenova/blenderbot-400M-distill': [
        'basic',
    ],

    'mistralai/Mistral-7B-Instruct-v0.1': [
        'basic',
    ],

    'HuggingFaceH4/zephyr-7b-beta': [
        'system',
    ],

    'Xenova/llama-tokenizer': [
        'basic',
        'system',
        'system + assistant',
    ],
    'Xenova/llama2-tokenizer': [
        'basic',
        'system',
        'system + assistant',
    ],
    'Xenova/llama2-chat-tokenizer': [
        'basic',
        'system',
        'system + assistant',
    ],
}


FLATTENED_SUPPORTED_MODELS = [
    (model_type, [
        model for task_models in tasks.values() for model in task_models
    ]) for model_type, tasks in SUPPORTED_MODELS.items()
]


def generate_tokenizer_tests():

    tokenization_results = {}

    tokenizers_to_test = FLATTENED_SUPPORTED_MODELS + \
        list(ADDITIONAL_TOKENIZERS_TO_TEST.items())

    for model_type, tokenizer_names in tokenizers_to_test:
        if model_type in MODELS_TO_IGNORE:
            continue
        if model_type in MAX_TESTS:
            tokenizer_names = tokenizer_names[:MAX_TESTS[model_type]]

        custom_by_model_type_texts = TOKENIZER_TEST_DATA["custom_by_model_type"].get(
            model_type, [])

        print(f'Generating tests for {model_type}')
        for tokenizer_name in tokenizer_names:
            if tokenizer_name in TOKENIZERS_TO_IGNORE:
                continue

            print('  -', tokenizer_name)

            try:
                # Load tokenizer
                if model_type == 'llama':
                    # As of 17/12/2023, there are a few issues with the Llama tokenizers in transformers.
                    # (1) Encoding with fast tokenizer adds whitespace after speical tokens:
                    #   - https://github.com/huggingface/transformers/issues/25881
                    #   - https://github.com/huggingface/transformers/issues/26318
                    #   - https://github.com/huggingface/transformers/issues/26455
                    #   - https://github.com/huggingface/transformers/issues/27544
                    # (2) Decoding with slow tokenizer adds whitespace after special tokens:
                    #   - https://github.com/huggingface/transformers/issues/25073
                    #
                    # So for now, we mix and match the tokenizers:
                    # i.e., use the fast tokenizer for encoding, and the slow tokenizer for decoding.
                    # TODO: remove when the above issues are fixed:
                    tokenizer = AutoTokenizer.from_pretrained(
                        tokenizer_name,
                        use_fast=False,
                    )
                    decoder_tokenizer = AutoTokenizer.from_pretrained(
                        tokenizer_name,
                        use_fast=True,
                    )

                else:
                    decoder_tokenizer = tokenizer = AutoTokenizer.from_pretrained(
                        tokenizer_name)

            except (KeyError, EnvironmentError):
                # If a KeyError/EnvironmentError is raised from the AutoTokenizer, it
                # means the model does not use a tokenizer (e.g., vision models)
                continue

            try:
                # Disable dropout, if the model allows it
                tokenizer.backend_tokenizer.model.dropout = 0
            except AttributeError:
                pass

            tokenizer_results = []

            shared_texts = TOKENIZER_TEST_DATA["shared"]
            custom_texts = TOKENIZER_TEST_DATA["custom"].get(
                tokenizer_name, [])

            # Run tokenizer on test cases
            for text in shared_texts + custom_texts + custom_by_model_type_texts:
                # TODO: add with_pair option
                try:
                    encoded = tokenizer(text).data
                except Exception:
                    # Ignore testing tokenizers which fail in the python library
                    continue

                decoded_with_special = decoder_tokenizer.decode(
                    encoded["input_ids"], skip_special_tokens=False)
                decoded_without_special = decoder_tokenizer.decode(
                    encoded["input_ids"], skip_special_tokens=True)

                tokenizer_results.append(dict(
                    input=text,
                    encoded=encoded,
                    decoded_with_special=decoded_with_special,
                    decoded_without_special=decoded_without_special,
                ))

            if tokenizer_results:
                tokenization_results[tokenizer_name] = tokenizer_results

    template_results = {}

    for tokenizer_id in TOKENIZERS_WITH_CHAT_TEMPLATES:
        print(f'Generating chat templates for {tokenizer_id}')
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_id,

            # TODO: Remove once https://github.com/huggingface/transformers/pull/26678 is fixed
            use_fast='llama' not in tokenizer_id,
        )
        tokenizer_results = []
        for key in TOKENIZERS_WITH_CHAT_TEMPLATES[tokenizer_id]:
            messages = CHAT_MESSAGES_EXAMPLES[key]

            for add_generation_prompt, tokenize in product([True, False], [True, False]):
                tokenizer_results.append(dict(
                    messages=messages,
                    add_generation_prompt=add_generation_prompt,
                    tokenize=tokenize,
                    target=tokenizer.apply_chat_template(
                        messages,
                        add_generation_prompt=add_generation_prompt,
                        tokenize=tokenize,
                    ),
                ))

        template_results[tokenizer_id] = tokenizer_results

    return dict(
        tokenization=tokenization_results,
        templates=template_results,
    )


def generate_config_tests():
    results = {}
    for model_type, config_names in FLATTENED_SUPPORTED_MODELS:
        print(f'Generating tests for {model_type}')

        for config_name in config_names:
            print('  -', config_name)
            try:
                # Load config
                config = AutoConfig.from_pretrained(config_name)
            except Exception:
                # Something went wrong, skip this config
                continue
            results[config_name] = config.to_dict()

            # TODO: Remove after https://github.com/huggingface/transformers/issues/23876 fixed
            results[config_name].pop('torch_dtype', None)

    return results


ARRAY_SIZES = sorted(set([2 ** i for i in range(1, 10)])
                     | set([3 ** i for i in range(1, 8)])
                     | set([5 ** i for i in range(1, 6)])
                     | set([7 ** i for i in range(1, 4)]))


def serialize_complex_array(arr):
    return [float(x) for y in arr for x in [y.real, y.imag]]


def serialize_real_array(arr):
    return arr.tolist()


def generate_fft_tests():
    np.random.seed(0)
    tests = {}
    for complex in [False, True]:
        serialize_fn = serialize_complex_array if complex else serialize_real_array
        for size in ARRAY_SIZES:
            arr = np.random.randn(size).astype(
                np.complex64 if complex else np.float64)
            if complex:
                arr += np.random.randn(size) * 1j
            tests[f"fft_{size}_{'complex' if complex else 'real'}"] = {
                "complex": complex,
                "input": serialize_fn(arr),
                "output": serialize_complex_array(np.fft.fft(arr)),
            }
    return tests


def main():
    # TODO add option to cache generated data + force build tests

    data_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "data",
    )

    tokenizer_tests = generate_tokenizer_tests()
    with open(os.path.join(data_dir, "tokenizer_tests.json"), "w", encoding="utf-8") as fp:
        json.dump(tokenizer_tests, fp)

    config_tests = generate_config_tests()
    with open(os.path.join(data_dir, "config_tests.json"), "w", encoding="utf-8") as fp:
        json.dump(config_tests, fp)

    fft_tests = generate_fft_tests()
    with open(os.path.join(data_dir, "fft_tests.json"), "w", encoding="utf-8") as fp:
        json.dump(fft_tests, fp)


if __name__ == "__main__":
    main()