# transformers.js/scripts/extra/clap.py

# TODO: Enable once https://github.com/huggingface/optimum/pull/1552 is merged
# # Support exporting audio and text models separately:
# # Adapted from https://github.com/huggingface/optimum/issues/1186#issuecomment-1637641760
# from optimum.exporters.onnx.model_configs import CLAPTextWithProjectionOnnxConfig, AudioOnnxConfig
# from optimum.utils.normalized_config import NormalizedAudioConfig
# from optimum.utils.input_generators import DummyAudioInputGenerator
# from typing import Dict

# class ClapAudioModelWithProjectionOnnxConfig(AudioOnnxConfig):
#     NORMALIZED_CONFIG_CLASS = NormalizedAudioConfig
#     DUMMY_INPUT_GENERATOR_CLASSES = (DummyAudioInputGenerator, )

#     @property
#     def inputs(self) -> Dict[str, Dict[int, str]]:
#         return {
#             "input_features": {0: "audio_batch_size", 1: "num_channels", 2: "height", 3: "width"},  # As described in modeling_clap.py
#         }

#     @property
#     def outputs(self) -> Dict[str, Dict[int, str]]:
#         return {
#             "audio_embeds": {0: "batch_size"},
#         }

# class ClapTextModelWithProjectionOnnxConfig(CLAPTextWithProjectionOnnxConfig):
#     @property
#     def outputs(self) -> Dict[str, Dict[int, str]]:
#         return {
#             "text_embeds": {0: "batch_size"},
#         }

#     def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
#         dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
#         if framework == "pt":
#             import torch
#             # Cast token ids to int64, the dtype expected by the exported text model.
#             dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int64)
#         return dummy_inputs
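

# Example usage (a sketch, not part of the original script): once the configs above are
# enabled, they could be passed to optimum's `main_export` together with
# `custom_onnx_configs` and `fn_get_submodels` to export the text and audio towers as
# separate ONNX graphs, following the pattern from the issue comment linked above.
# The checkpoint id, output path, and submodel split below are illustrative assumptions.
#
# from optimum.exporters.onnx import main_export
# from transformers import AutoConfig, ClapTextModelWithProjection, ClapAudioModelWithProjection
#
# model_id = "laion/clap-htsat-unfused"  # assumed example checkpoint
# config = AutoConfig.from_pretrained(model_id)
#
# main_export(
#     model_id,
#     output="clap_onnx",
#     task="feature-extraction",
#     custom_onnx_configs={
#         "text_model": ClapTextModelWithProjectionOnnxConfig(config.text_config),
#         "audio_model": ClapAudioModelWithProjectionOnnxConfig(config.audio_config),
#     },
#     fn_get_submodels=lambda model: {
#         "text_model": ClapTextModelWithProjection.from_pretrained(model_id),
#         "audio_model": ClapAudioModelWithProjection.from_pretrained(model_id),
#     },
# )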