# transformers.js/scripts/convert.py


import json
import os
import shutil
from dataclasses import dataclass, field
from typing import Optional, Set
from tqdm import tqdm

from transformers import (
    AutoConfig,
    AutoTokenizer,
    HfArgumentParser
)
from transformers.utils import cached_file

import onnx
from optimum.exporters.onnx import main_export
from optimum.exporters.tasks import TasksManager
from onnxruntime.quantization import (
    quantize_dynamic,
    QuantType
)


@dataclass
class ConversionArguments:
    """
    Arguments used for converting HuggingFace models to onnx.

    This dataclass is handed to `HfArgumentParser` in `main`, so every field
    below becomes a command-line option and its `help` metadata is the
    option's help text.
    """

    # Identifier of the model to convert. Required (no default). It is also
    # appended to `output_parent_dir` to form the output folder.
    model_id: str = field(
        metadata={
            "help": "Model identifier"
        }
    )
    # When set, every exported .onnx file is additionally quantized.
    quantize: bool = field(
        default=False,
        metadata={
            "help": "Whether to quantize the model."
        }
    )
    # Parent folder; the model is written to `<output_parent_dir>/<model_id>`.
    output_parent_dir: str = field(
        default='./models/',
        metadata={
            "help": "Path where the converted model will be saved to."
        }
    )

    # Export task, forwarded unchanged to the exporter.
    task: Optional[str] = field(
        default='auto',
        metadata={
            "help": (
                "The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"
                f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
            )
        }
    )

    # ONNX opset version. `None` means "let the exporter pick its default",
    # hence the Optional annotation (the default is not an int).
    opset: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."
            )
        }
    )

    # Device forwarded to the exporter.
    device: str = field(
        default='cpu',
        metadata={
            "help": 'The device to use to do the export.'
        }
    )
    # Inverted and passed as `do_validation` to the exporter.
    skip_validation: bool = field(
        default=False,
        metadata={
            "help": "Whether to skip validation of the converted model"
        }
    )

    # Quantization options, forwarded to `quantize_dynamic` and recorded in
    # the saved quantization config.
    per_channel: bool = field(
        default=True,
        metadata={
            "help": "Whether to quantize weights per channel"
        }
    )
    reduce_range: bool = field(
        default=True,
        metadata={
            "help": "Whether to quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode"
        }
    )


def get_operators(model: onnx.ModelProto) -> Set[str]:
    """
    Collect the operator types used anywhere in an ONNX model.

    The main graph and every subgraph reachable through node attributes are
    visited, so operators that only occur inside nested graphs (e.g. the body
    of a control-flow node) are included.

    Args:
        model: The loaded ONNX model.
    Returns:
        The set of `op_type` strings of all visited nodes.
    """
    operators = set()

    # Explicit stack instead of recursion, so deeply nested subgraphs cannot
    # hit the interpreter's recursion limit.
    pending = [model.graph]
    while pending:
        graph = pending.pop()
        for node in graph.node:
            operators.add(node.op_type)
            for attr in node.attribute:
                if attr.type == onnx.AttributeProto.GRAPH:
                    # Single subgraph stored in `g`.
                    pending.append(attr.g)
                elif attr.type == onnx.AttributeProto.GRAPHS:
                    # List of subgraphs stored in `graphs`.
                    pending.extend(attr.graphs)

    return operators


def quantize(model_names_or_paths, conv_args: ConversionArguments):
    """
    Quantize the weights of the given ONNX models from float32 to int8 to allow very efficient inference on modern CPU

    Signed int8 weights (QInt8) are used by default, per
    https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection
    it is faster on most CPU architectures. Models containing a `Conv`
    operator get unsigned weights (QUInt8) instead (see the NOTE below).

    For every input `<name>.onnx`, `<name>_quantized.onnx` is written next to
    it. Afterwards the options used are saved as `quant_config.json` in the
    directory of the last model processed.

    Args:
        model_names_or_paths: Iterable of paths to the exported ONNX models.
        conv_args: Conversion arguments; `per_channel` and `reduce_range`
            are read from it.
    Returns:
        None. If no model paths are given, nothing is written.
    """

    # Materialize once so an empty input can be detected up front; previously
    # an empty input crashed after the loop on the unbound `directory_path`.
    model_paths = list(model_names_or_paths)
    if not model_paths:
        return

    quant_config = dict(
        per_channel=conv_args.per_channel,
        reduce_range=conv_args.reduce_range,
        per_model_config={}
    )

    for model in tqdm(model_paths, desc='Quantizing'):
        directory_path = os.path.dirname(model)
        file_name_without_extension = os.path.splitext(
            os.path.basename(model))[0]

        # NOTE:
        # As of 2023/04/20, the current latest version of onnxruntime-web is 1.14.0, and does not support INT8 weights for Conv layers.
        # For this reason, we choose model weight types to ensure compatibility with onnxruntime-web.
        #
        # As per docs, signed weight type (QInt8) is faster on most CPUs, so, we use that unless the model contains a Conv layer.
        # For more information, see:
        #  - https://github.com/microsoft/onnxruntime/issues/3130#issuecomment-1105200621
        #  - https://github.com/microsoft/onnxruntime/issues/2339

        # The loaded model is only needed to list its operators; not binding
        # it to a name lets it be freed before `quantize_dynamic` loads the
        # same file again.
        op_types = get_operators(onnx.load_model(model))
        weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8

        quantize_dynamic(
            model_input=model,
            model_output=os.path.join(
                directory_path, f'{file_name_without_extension}_quantized.onnx'),

            per_channel=conv_args.per_channel,
            reduce_range=conv_args.reduce_range,

            weight_type=weight_type,
            optimize_model=False,

            # TODO allow user to specify these
            # op_types_to_quantize=['MatMul', 'Add', 'Conv'],
            extra_options=dict(
                EnableSubgraph=True
            )
        )

        quant_config['per_model_config'][file_name_without_extension] = dict(
            # Sorted so the saved config is reproducible (set order is arbitrary).
            op_types=sorted(op_types),
            weight_type=str(weight_type),
        )

    # Save quantization config
    with open(os.path.join(directory_path, 'quant_config.json'), 'w', encoding='utf-8') as fp:
        json.dump(quant_config, fp, indent=4)


def copy_if_exists(model_path, file_name, destination):
    """
    Copy one file belonging to a model into `destination`, if it is available.

    The file is looked up with `cached_file`, asking it not to raise for
    missing entries; a `None` result is treated as "file not available" and
    nothing is copied.

    Args:
        model_path: Model identifier or path passed to `cached_file`.
        file_name: Name of the file to look up.
        destination: Target passed to `shutil.copy`.
    """
    resolved_path = cached_file(
        model_path,
        file_name,
        _raise_exceptions_for_missing_entries=False,
    )
    if resolved_path is None:
        return

    shutil.copy(resolved_path, destination)


def main():
    """
    Command-line entry point: export a HuggingFace model to ONNX.

    Steps:
      1. Parse `ConversionArguments` from the command line and create
         `<output_parent_dir>/<model_id>`.
      2. For `marian` models, generate and write a `tokenizer.json`.
      3. Export the model to ONNX with `main_export`.
      4. Optionally quantize every exported `.onnx` file.
      5. Move all `.onnx` / `.onnx_data` files into an `onnx` subfolder.

    NOTE(review): the marian branch uses a relative import, so this script
    presumably has to be run as a module of its package - confirm.
    """

    parser = HfArgumentParser(
        (ConversionArguments, )
    )
    conv_args, = parser.parse_args_into_dataclasses()

    model_id = conv_args.model_id

    output_model_folder = os.path.join(conv_args.output_parent_dir, model_id)

    # Create output folder
    os.makedirs(output_model_folder, exist_ok=True)

    # Copy certain JSON files, which save_pretrained doesn't handle
    # copy_if_exists(model_id, 'tokenizer.json', output_model_folder)

    # copy_if_exists(model_id, 'preprocessor_config.json', output_model_folder)
    # copy_if_exists(model_id, 'generation_config.json', output_model_folder)

    # # Saving the model config
    config = AutoConfig.from_pretrained(model_id)
    # config.save_pretrained(output_model_folder)

    # Only the tokenizer load is guarded: a KeyError here is treated as
    # "this model has no tokenizer". The special-case handling below is kept
    # outside the `try` so a KeyError raised there is not silently swallowed.
    try:
        # Save tokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # tokenizer.save_pretrained(output_model_folder)
    except KeyError:
        tokenizer = None  # No Tokenizer

    # Handle special cases
    if tokenizer is not None and config.model_type == 'marian':
        # Imported lazily: only needed for marian models.
        from .extra.marian import generate_tokenizer_json
        tokenizer_json = generate_tokenizer_json(model_id, tokenizer)

        with open(os.path.join(output_model_folder, 'tokenizer.json'), 'w', encoding='utf-8') as fp:
            json.dump(tokenizer_json, fp)

    # Step 1. convert huggingface model to onnx
    main_export(
        model_name_or_path=model_id,
        output=output_model_folder,
        task=conv_args.task,
        opset=conv_args.opset,
        device=conv_args.device,
        do_validation=not conv_args.skip_validation,
    )

    # Step 2. (optional, recommended) quantize the converted model for fast inference and to reduce model size.
    if conv_args.quantize:
        # Skip files that are already quantized outputs (e.g. from an
        # earlier run into the same folder).
        quantize([
            os.path.join(output_model_folder, x)
            for x in os.listdir(output_model_folder)
            if x.endswith('.onnx') and not x.endswith('_quantized.onnx')
        ], conv_args)

    # Step 3. Move .onnx files to the 'onnx' subfolder
    onnx_folder = os.path.join(output_model_folder, 'onnx')
    os.makedirs(onnx_folder, exist_ok=True)
    for file in os.listdir(output_model_folder):
        if file.endswith(('.onnx', '.onnx_data')):
            shutil.move(os.path.join(output_model_folder, file),
                        os.path.join(onnx_folder, file))


# Run the conversion only when this file is executed as the main module,
# not when it is imported.
if __name__ == '__main__':
    main()
Add conversion script for models 2023-02-16 06:11:49 +08:00
Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`import json`
Add conversion script for models 2023-02-16 06:11:49 +08:00			`import os`
Update conversion script to copy certain JSON files to destination (#11) 2023-03-10 21:07:18 +08:00			`import shutil`
Add conversion script for models 2023-02-16 06:11:49 +08:00			`from dataclasses import dataclass, field`
Improvements to documentation (#172) * link to the conversion Space for maximum simplicity * add some types to script (very optional) * typo * no need for trailing slash here * Node is also a valid option * Document how to find a compatible checkpoint on the hub * Update README * Fix typing * Update docs index --------- Co-authored-by: Julien Chaumond <julien@huggingface.co> 2023-06-30 01:32:17 +08:00			`from typing import Optional, Set`
Remove unused imports 2023-04-22 05:20:54 +08:00			`from tqdm import tqdm`
Add conversion script for models 2023-02-16 06:11:49 +08:00
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`from transformers import (`
			`AutoConfig,`
			`AutoTokenizer,`
			`HfArgumentParser`
			`)`
Update conversion script to copy certain JSON files to destination (#11) 2023-03-10 21:07:18 +08:00			`from transformers.utils import cached_file`
Add conversion script for models 2023-02-16 06:11:49 +08:00
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`import onnx`
			`from optimum.exporters.onnx import main_export`
Add conversion script for models 2023-02-16 06:11:49 +08:00			`from optimum.exporters.tasks import TasksManager`
Add `squeezebert` to unsigned model types 2023-03-29 21:48:14 +08:00			`from onnxruntime.quantization import (`
			`quantize_dynamic,`
			`QuantType`
			`)`
Add conversion script for models 2023-02-16 06:11:49 +08:00

			`@dataclass`
			`class ConversionArguments:`
			`"""`
			`Arguments used for converting HuggingFace models to onnx.`
			`"""`

			`model_id: str = field(`
			`metadata={`
			`"help": "Model identifier"`
			`}`
			`)`
			`quantize: bool = field(`
			`default=False,`
			`metadata={`
			`"help": "Whether to quantize the model."`
			`}`
			`)`
			`output_parent_dir: str = field(`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`default='./models/',`
Add conversion script for models 2023-02-16 06:11:49 +08:00			`metadata={`
			`"help": "Path where the converted model will be saved to."`
			`}`
			`)`

			`task: Optional[str] = field(`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`default='auto',`
Add conversion script for models 2023-02-16 06:11:49 +08:00			`metadata={`
			`"help": (`
			`"The task to export the model for. If not specified, the task will be auto-inferred based on the model. Available tasks depend on the model, but are among:"`
			f" {str(list(TasksManager._TASKS_TO_AUTOMODELS.keys()))}. For decoder models, use `xxx-with-past` to export the model using past key values in the decoder."
			`)`
			`}`
			`)`

			`opset: int = field(`
			`default=None,`
			`metadata={`
			`"help": (`
			`"If specified, ONNX opset version to export the model with. Otherwise, the default opset will be used."`
			`)`
			`}`
			`)`

			`device: str = field(`
			`default='cpu',`
			`metadata={`
Update conversion script help command 2023-04-02 19:18:30 +08:00			`"help": 'The device to use to do the export.'`
Add conversion script for models 2023-02-16 06:11:49 +08:00			`}`
			`)`
Add `--skip_validation` option to conversion script 2023-05-31 17:59:20 +08:00			`skip_validation: bool = field(`
			`default=False,`
			`metadata={`
			`"help": "Whether to skip validation of the converted model"`
			`}`
			`)`
Add conversion script for models 2023-02-16 06:11:49 +08:00
Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`per_channel: bool = field(`
			`default=True,`
			`metadata={`
			`"help": "Whether to quantize weights per channel"`
			`}`
			`)`
			`reduce_range: bool = field(`
			`default=True,`
			`metadata={`
			`"help": "Whether to quantize weights with 7-bits. It may improve the accuracy for some models running on non-VNNI machine, especially for per-channel mode"`
			`}`
			`)`


Improvements to documentation (#172) * link to the conversion Space for maximum simplicity * add some types to script (very optional) * typo * no need for trailing slash here * Node is also a valid option * Document how to find a compatible checkpoint on the hub * Update README * Fix typing * Update docs index --------- Co-authored-by: Julien Chaumond <julien@huggingface.co> 2023-06-30 01:32:17 +08:00			`def get_operators(model: onnx.ModelProto) -> Set[str]:`
Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`operators = set()`
Add conversion script for models 2023-02-16 06:11:49 +08:00
Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`def traverse_graph(graph):`
			`for node in graph.node:`
			`operators.add(node.op_type)`
			`for attr in node.attribute:`
			`if attr.type == onnx.AttributeProto.GRAPH:`
			`subgraph = attr.g`
			`traverse_graph(subgraph)`

			`traverse_graph(model.graph)`
			`return operators`


			`def quantize(model_names_or_paths, conv_args: ConversionArguments):`
Use optimum for converting to onnx 2023-02-21 22:53:32 +08:00			`"""`
			`Quantize the weights of the model from float32 to int8 to allow very efficient inference on modern CPU`

			`Uses unsigned ints for activation values, signed ints for weights, per`
			`https://onnxruntime.ai/docs/performance/quantization.html#data-type-selection`
			`it is faster on most CPU architectures`
			`Args:`
			`onnx_model_path: Path to location the exported ONNX model is stored`
			`Returns: The Path generated for the quantized`
			`"""`

Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`quant_config = dict(`
			`per_channel=conv_args.per_channel,`
			`reduce_range=conv_args.reduce_range,`
			`per_model_config={}`
			`)`

Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`for model in tqdm(model_names_or_paths, desc='Quantizing'):`
			`directory_path = os.path.dirname(model)`
			`file_name_without_extension = os.path.splitext(`
			`os.path.basename(model))[0]`
Fix whisper quantization 2023-03-08 05:53:50 +08:00
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`# NOTE:`
			`# As of 2023/04/20, the current latest version of onnxruntime-web is 1.14.0, and does not support INT8 weights for Conv layers.`
			`# For this reason, we choose model weight types to ensure compatibility with onnxruntime-web.`
			`#`
			`# As per docs, signed weight type (QInt8) is faster on most CPUs, so, we use that unless the model contains a Conv layer.`
			`# For more information, see:`
			`# - https://github.com/microsoft/onnxruntime/issues/3130#issuecomment-1105200621`
			`# - https://github.com/microsoft/onnxruntime/issues/2339`

Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`loaded_model = onnx.load_model(model)`
			`op_types = get_operators(loaded_model)`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`weight_type = QuantType.QUInt8 if 'Conv' in op_types else QuantType.QInt8`
Fix whisper quantization 2023-03-08 05:53:50 +08:00
Use optimum for converting to onnx 2023-02-21 22:53:32 +08:00			`quantize_dynamic(`
			`model_input=model,`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`model_output=os.path.join(`
			`directory_path, f'{file_name_without_extension}_quantized.onnx'),`
Update default quantization params in conversion script 2023-05-30 20:57:44 +08:00
Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`per_channel=conv_args.per_channel,`
			`reduce_range=conv_args.reduce_range,`
Fix whisper quantization 2023-03-08 05:53:50 +08:00
			`weight_type=weight_type,`
Use optimum for converting to onnx 2023-02-21 22:53:32 +08:00			`optimize_model=False,`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00
			`# TODO allow user to specify these`
			`# op_types_to_quantize=['MatMul', 'Add', 'Conv'],`
			`extra_options=dict(`
			`EnableSubgraph=True`
			`)`
			`)`
Use optimum for converting to onnx 2023-02-21 22:53:32 +08:00
Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`quant_config['per_model_config'][file_name_without_extension] = dict(`
			`op_types=list(op_types),`
			`weight_type=str(weight_type),`
			`)`

			`# Save quantization config`
			`with open(os.path.join(directory_path, 'quant_config.json'), 'w') as fp:`
			`json.dump(quant_config, fp, indent=4)`

Use optimum for converting to onnx 2023-02-21 22:53:32 +08:00
Update conversion script to copy certain JSON files to destination (#11) 2023-03-10 21:07:18 +08:00			`def copy_if_exists(model_path, file_name, destination):`
			`file = cached_file(model_path, file_name,`
			`_raise_exceptions_for_missing_entries=False)`
			`if file is not None:`
			`shutil.copy(file, destination)`


Add conversion script for models 2023-02-16 06:11:49 +08:00			`def main():`

			`parser = HfArgumentParser(`
			`(ConversionArguments, )`
			`)`
			`conv_args, = parser.parse_args_into_dataclasses()`

Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`model_id = conv_args.model_id`
Update conversion script 2023-03-02 01:12:32 +08:00
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`output_model_folder = os.path.join(conv_args.output_parent_dir, model_id)`
Add conversion script for models 2023-02-16 06:11:49 +08:00
Update conversion script to copy certain JSON files to destination (#11) 2023-03-10 21:07:18 +08:00			`# Create output folder`
			`os.makedirs(output_model_folder, exist_ok=True)`

			`# Copy certain JSON files, which save_pretrained doesn't handle`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`# copy_if_exists(model_id, 'tokenizer.json', output_model_folder)`
Update conversion script to support merging decoders 2023-03-10 02:05:18 +08:00
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`# copy_if_exists(model_id, 'preprocessor_config.json', output_model_folder)`
			`# copy_if_exists(model_id, 'generation_config.json', output_model_folder)`
Update conversion script to skip tokenizer if it doesn't exist 2023-03-14 07:28:22 +08:00
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`# # Saving the model config`
			`config = AutoConfig.from_pretrained(model_id)`
			`# config.save_pretrained(output_model_folder)`
Add conversion script for models 2023-02-16 06:11:49 +08:00
Update conversion script to skip tokenizer if it doesn't exist 2023-03-14 07:28:22 +08:00			`try:`
			`# Save tokenizer`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`tokenizer = AutoTokenizer.from_pretrained(model_id)`
			`# tokenizer.save_pretrained(output_model_folder)`
Update conversion script to support `marian` models 2023-04-02 01:32:15 +08:00
			`# Handle special cases`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`if config.model_type == 'marian':`
Update conversion script to support `marian` models 2023-04-02 01:32:15 +08:00			`import json`
Rename folder to "extra" 2023-04-06 03:09:42 +08:00			`from .extra.marian import generate_tokenizer_json`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`tokenizer_json = generate_tokenizer_json(model_id, tokenizer)`
Update conversion script to support `marian` models 2023-04-02 01:32:15 +08:00
			`with open(os.path.join(output_model_folder, 'tokenizer.json'), 'w', encoding='utf-8') as fp:`
			`json.dump(tokenizer_json, fp)`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00
Update conversion script to skip tokenizer if it doesn't exist 2023-03-14 07:28:22 +08:00			`except KeyError:`
			`pass # No Tokenizer`
Add conversion script for models 2023-02-16 06:11:49 +08:00
			`# Step 1. convert huggingface model to onnx`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`main_export(`
			`model_name_or_path=model_id,`
			`output=output_model_folder,`
			`task=conv_args.task,`
Update conversion script args 2023-05-31 21:13:19 +08:00			`opset=conv_args.opset,`
			`device=conv_args.device,`
Add `--skip_validation` option to conversion script 2023-05-31 17:59:20 +08:00			`do_validation=not conv_args.skip_validation,`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`)`
Use optimum for converting to onnx 2023-02-21 22:53:32 +08:00
Add conversion script for models 2023-02-16 06:11:49 +08:00			`# Step 2. (optional, recommended) quantize the converted model for fast inference and to reduce model size.`
			`if conv_args.quantize:`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`quantize([`
			`os.path.join(output_model_folder, x)`
			`for x in os.listdir(output_model_folder)`
			`if x.endswith('.onnx') and not x.endswith('_quantized.onnx')`
Allow user to set `per_channel` and `reduce_range` quantization params (#156) (#157) * Allow user to set `per_channel` and `reduce_range` quantization parameters (#156) Also save quantization options * Get operators of graph and subgraphs 2023-06-22 06:43:43 +08:00			`], conv_args)`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00
			`# Step 3. Move .onnx files to the 'onnx' subfolder`
			`os.makedirs(os.path.join(output_model_folder, 'onnx'), exist_ok=True)`
			`for file in os.listdir(output_model_folder):`
Also move `.onnx_data` files to the onnx folder 2023-04-28 03:52:05 +08:00			`if file.endswith('.onnx') or file.endswith('.onnx_data'):`
Improve conversion script Determine method to quantize based on supported operations 2023-04-21 00:56:25 +08:00			`shutil.move(os.path.join(output_model_folder, file),`
			`os.path.join(output_model_folder, 'onnx', file))`
Add conversion script for models 2023-02-16 06:11:49 +08:00

			`if __name__ == '__main__':`
			`main()`