transformers.js/scripts/extra/clip.py

# Support exporting vision and text models separately:
# Adapted from https://github.com/huggingface/optimum/issues/1186#issuecomment-1637641760
from typing import Dict

from optimum.exporters.onnx.model_configs import CLIPTextOnnxConfig, ViTOnnxConfig

class CLIPVisionOnnxConfig(ViTOnnxConfig):
    pass

class CLIPTextModelWithProjectionOnnxConfig(CLIPTextOnnxConfig):
    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Expose only the projected text embeddings, with a dynamic batch axis.
        return {
            "text_embeds": {0: "batch_size"},
        }

    def generate_dummy_inputs(self, framework: str = "pt", **kwargs):
        dummy_inputs = super().generate_dummy_inputs(framework=framework, **kwargs)
        if framework == "pt":
            import torch

            # The exported graph expects int64 token ids.
            dummy_inputs["input_ids"] = dummy_inputs["input_ids"].to(dtype=torch.int64)
        return dummy_inputs

class CLIPVisionModelWithProjectionOnnxConfig(CLIPVisionOnnxConfig):
    @property
    def outputs(self) -> Dict[str, Dict[int, str]]:
        # Expose only the projected image embeddings, with a dynamic batch axis.
        return {
            "image_embeds": {0: "batch_size"},
        }