From 5f572241b4dee3622dc31584ad682abf496fef2f Mon Sep 17 00:00:00 2001
From: Joshua Lochner
Date: Wed, 6 Mar 2024 13:08:28 +0200
Subject: [PATCH] Add support for `UniSpeech` and `UniSpeechSat` models (#624)

* Add support for `UniSpeech` and `UniSpeechSat` models

* Remove listed support for unispeech STT models

* Align unispeech w/ existing wav2vec logic
---
 README.md                                |   2 +
 docs/snippets/6_supported-models.snippet |   2 +
 scripts/convert.py                       |  16 +++-
 scripts/supported_models.py              |  50 +++++++++++
 src/models.js                            | 103 ++++++++++++++++++++++-
 src/pipelines.js                         |   2 +
 6 files changed, 173 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 4fcaf68..a5e51fe 100644
--- a/README.md
+++ b/README.md
@@ -341,6 +341,8 @@ You can refine your search by selecting the task you're interested in (e.g., [te
 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
+1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
+1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
diff --git a/docs/snippets/6_supported-models.snippet b/docs/snippets/6_supported-models.snippet
index ed2c1bd..eed000b 100644
--- a/docs/snippets/6_supported-models.snippet
+++ b/docs/snippets/6_supported-models.snippet
@@ -76,6 +76,8 @@
 1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
 1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
+1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
+1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
 1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
diff --git a/scripts/convert.py b/scripts/convert.py
index 5b86204..d538439 100644
--- a/scripts/convert.py
+++ b/scripts/convert.py
@@ -103,6 +103,18 @@ MODEL_SPECIFIC_QUANTIZE_PARAMS = {
         'per_channel': False,
         'reduce_range': False,
     },
+    'wav2vec2': {
+        'per_channel': False,
+        'reduce_range': False,
+    },
+    'unispeech': {
+        'per_channel': False,
+        'reduce_range': False,
+    },
+    'unispeech-sat': {
+        'per_channel': False,
+        'reduce_range': False,
+    },
 }
 
 MODELS_WITHOUT_TOKENIZERS = [
@@ -110,6 +122,8 @@ MODELS_WITHOUT_TOKENIZERS = [
     'wav2vec2-bert',
     'wavlm',
     'hubert',
+    'unispeech',
+    'unispeech-sat',
 ]
 
 
@@ -386,7 +400,7 @@ def main():
                 **get_main_export_kwargs(config, "automatic-speech-recognition")
             )
 
-    elif config.model_type in ('wav2vec2', 'wav2vec2-bert', 'hubert'):
+    elif config.model_type in ('wav2vec2', 'wav2vec2-bert', 'hubert', 'unispeech' , 'unispeech-sat'):
         if tokenizer is not None:
             from .extra.wav2vec2 import generate_tokenizer_json
             tokenizer_json = generate_tokenizer_json(tokenizer)
diff --git a/scripts/supported_models.py b/scripts/supported_models.py
index 7d7a5c1..7a2fd62 100644
--- a/scripts/supported_models.py
+++ b/scripts/supported_models.py
@@ -924,6 +924,45 @@ SUPPORTED_MODELS = {
             'microsoft/trocr-base-handwritten',
         ],
     },
+    'unispeech': {
+        # Feature extraction
+        'feature-extraction': [
+            # Requires --task feature-extraction
+            'microsoft/unispeech-large-1500h-cv',
+        ],
+        # TODO: add support for
+        # # Automatic speech recognition
+        # 'automatic-speech-recognition': [
+        #     'microsoft/unispeech-1350-en-353-fr-ft-1h',
+        #     'microsoft/unispeech-1350-en-17h-ky-ft-1h',
+        #     'microsoft/unispeech-1350-en-90-it-ft-1h',
+        #     'microsoft/unispeech-1350-en-168-es-ft-1h',
+        # ],
+    },
+    'unispeech-sat': {
+        # Feature extraction
+        'feature-extraction': [
+            # Requires --task feature-extraction
+            'microsoft/unispeech-sat-base',
+        ],
+
+        # Audio XVector (e.g., for speaker verification)
+        'audio-xvector': [
+            'microsoft/unispeech-sat-base-plus-sv',
+            'microsoft/unispeech-sat-base-sv',
+            'microsoft/unispeech-sat-large-sv',
+        ],
+
+        # Audio frame classification
+        'audio-frame-classification': [
+            'microsoft/unispeech-sat-base-plus-sd',
+        ],
+
+        # Automatic speech recognition
+        'automatic-speech-recognition': [
+            'microsoft/unispeech-sat-base-100h-libri-ft',
+        ],
+    },
     'vision-encoder-decoder': {
         # Image-to-text
         'image-to-text': [
@@ -993,6 +1032,11 @@
             'facebook/mms-lid-4017',
         ],
 
+        # Audio frame classification
+        'audio-frame-classification': [
+            'anton-l/wav2vec2-base-superb-sd',
+        ],
+
         # Automatic speech recognition
         'automatic-speech-recognition': [
             'jonatasgrosman/wav2vec2-large-xlsr-53-english',
@@ -1020,6 +1064,12 @@
             'microsoft/wavlm-large',
         ],
 
+        # Audio frame classification
+        'audio-frame-classification': [
+            'anton-l/wav2vec2-base-superb-sd',
+            'microsoft/wavlm-base-plus-sd',
+        ],
+
         # Audio XVector (e.g., for speaker verification)
         'audio-xvector': [
             'microsoft/wavlm-base-plus-sv',
diff --git a/src/models.js b/src/models.js
index cf78073..ec46120 100644
--- a/src/models.js
+++ b/src/models.js
@@ -4574,7 +4574,97 @@ export class Wav2Vec2ForSequenceClassification extends Wav2Vec2PreTrainedModel {
 //////////////////////////////////////////////////
 
 //////////////////////////////////////////////////
-// Wav2Vec2 models
+// UniSpeech models
+export class UniSpeechPreTrainedModel extends PreTrainedModel { };
+
+/**
+ * The bare UniSpeech Model transformer outputting raw hidden-states without any specific head on top.
+ */
+export class UniSpeechModel extends UniSpeechPreTrainedModel { }
+
+/**
+ * UniSpeech Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
+ */
+export class UniSpeechForCTC extends UniSpeechPreTrainedModel {
+    /**
+     * @param {Object} model_inputs
+     * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform.
+     * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1]
+     */
+    async _call(model_inputs) {
+        return new CausalLMOutput(await super._call(model_inputs));
+    }
+}
+
+/**
+ * UniSpeech Model with a sequence classification head on top (a linear layer over the pooled output).
+ */
+export class UniSpeechForSequenceClassification extends UniSpeechPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+// UniSpeechSat models
+export class UniSpeechSatPreTrainedModel extends PreTrainedModel { };
+
+/**
+ * The bare UniSpeechSat Model transformer outputting raw hidden-states without any specific head on top.
+ */
+export class UniSpeechSatModel extends UniSpeechSatPreTrainedModel { }
+
+/**
+ * UniSpeechSat Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
+ */
+export class UniSpeechSatForCTC extends UniSpeechSatPreTrainedModel {
+    /**
+     * @param {Object} model_inputs
+     * @param {Tensor} model_inputs.input_values Float values of input raw speech waveform.
+     * @param {Tensor} model_inputs.attention_mask Mask to avoid performing convolution and attention on padding token indices. Mask values selected in [0, 1]
+     */
+    async _call(model_inputs) {
+        return new CausalLMOutput(await super._call(model_inputs));
+    }
+}
+
+/**
+ * UniSpeechSat Model with a sequence classification head on top (a linear layer over the pooled output).
+ */
+export class UniSpeechSatForSequenceClassification extends UniSpeechSatPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<SequenceClassifierOutput>} An object containing the model's output logits for sequence classification.
+     */
+    async _call(model_inputs) {
+        return new SequenceClassifierOutput(await super._call(model_inputs));
+    }
+}
+
+/**
+ * UniSpeechSat Model with a frame classification head on top for tasks like Speaker Diarization.
+ */
+export class UniSpeechSatForAudioFrameClassification extends UniSpeechSatPreTrainedModel {
+    /**
+     * Calls the model on new inputs.
+     * @param {Object} model_inputs The inputs to the model.
+     * @returns {Promise<TokenClassifierOutput>} An object containing the model's output logits for audio frame classification.
+     */
+    async _call(model_inputs) {
+        return new TokenClassifierOutput(await super._call(model_inputs));
+    }
+}
+//////////////////////////////////////////////////
+
+//////////////////////////////////////////////////
+// Wav2Vec2Bert models
 export class Wav2Vec2BertPreTrainedModel extends PreTrainedModel { };
 
 /**
@@ -5289,6 +5379,8 @@ const MODEL_MAPPING_NAMES_ENCODER_ONLY = new Map([
     ['squeezebert', ['SqueezeBertModel', SqueezeBertModel]],
     ['wav2vec2', ['Wav2Vec2Model', Wav2Vec2Model]],
     ['wav2vec2-bert', ['Wav2Vec2BertModel', Wav2Vec2BertModel]],
+    ['unispeech', ['UniSpeechModel', UniSpeechModel]],
+    ['unispeech-sat', ['UniSpeechSatModel', UniSpeechSatModel]],
     ['hubert', ['HubertModel', HubertModel]],
     ['wavlm', ['WavLMModel', WavLMModel]],
     ['audio-spectrogram-transformer', ['ASTModel', ASTModel]],
@@ -5514,6 +5606,8 @@ const MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = new Map([
 const MODEL_FOR_CTC_MAPPING_NAMES = new Map([
     ['wav2vec2', ['Wav2Vec2ForCTC', Wav2Vec2ForCTC]],
     ['wav2vec2-bert', ['Wav2Vec2BertForCTC', Wav2Vec2BertForCTC]],
+    ['unispeech', ['UniSpeechForCTC', UniSpeechForCTC]],
+    ['unispeech-sat', ['UniSpeechSatForCTC', UniSpeechSatForCTC]],
     ['wavlm', ['WavLMForCTC', WavLMForCTC]],
     ['hubert', ['HubertForCTC', HubertForCTC]],
 ]);
@@ -5521,6 +5615,8 @@ const MODEL_FOR_CTC_MAPPING_NAMES = new Map([
 const MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = new Map([
     ['wav2vec2', ['Wav2Vec2ForSequenceClassification', Wav2Vec2ForSequenceClassification]],
     ['wav2vec2-bert', ['Wav2Vec2BertForSequenceClassification', Wav2Vec2BertForSequenceClassification]],
+    ['unispeech', ['UniSpeechForSequenceClassification', UniSpeechForSequenceClassification]],
+    ['unispeech-sat', ['UniSpeechSatForSequenceClassification', UniSpeechSatForSequenceClassification]],
     ['wavlm', ['WavLMForSequenceClassification', WavLMForSequenceClassification]],
     ['hubert', ['HubertForSequenceClassification', HubertForSequenceClassification]],
     ['audio-spectrogram-transformer', ['ASTForAudioClassification', ASTForAudioClassification]],
@@ -5530,6 +5626,10 @@ const MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = new Map([
     ['wavlm', ['WavLMForXVector', WavLMForXVector]],
 ]);
 
+const MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = new Map([
+    ['unispeech-sat', ['UniSpeechSatForAudioFrameClassification', UniSpeechSatForAudioFrameClassification]],
+]);
+
 const MODEL_FOR_IMAGE_MATTING_MAPPING_NAMES = new Map([
     ['vitmatte', ['VitMatteForImageMatting', VitMatteForImageMatting]],
 ]);
@@ -5571,6 +5671,7 @@ const MODEL_CLASS_TYPE_MAPPING = [
     [MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES, MODEL_TYPES.Seq2Seq],
     [MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
     [MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
+    [MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES, MODEL_TYPES.EncoderOnly],
 ];
 
 for (const [mappings, type] of MODEL_CLASS_TYPE_MAPPING) {
diff --git a/src/pipelines.js b/src/pipelines.js
index a9af251..25dfb58 100644
--- a/src/pipelines.js
+++ b/src/pipelines.js
@@ -1526,6 +1526,8 @@ export class AutomaticSpeechRecognitionPipeline extends (/** @type {new (options
                 return this._call_whisper(audio, kwargs)
             case 'wav2vec2':
             case 'wav2vec2-bert':
+            case 'unispeech':
+            case 'unispeech-sat':
             case 'hubert':
                 return this._call_wav2vec2(audio, kwargs)
             default:
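
Usage note (illustrative, not part of the patch): once a checkpoint has been converted to ONNX with scripts/convert.py, the new `unispeech` and `unispeech-sat` cases are routed through the same `_call_wav2vec2` CTC path as `wav2vec2` and `hubert`, so the existing `pipeline` API of `@xenova/transformers` applies unchanged. A minimal sketch follows; the Hub repository ID `Xenova/unispeech-sat-base-100h-libri-ft` and the sample audio URL are assumptions for illustration, not something this patch guarantees to exist.

// Sketch only: assumes an ONNX export of microsoft/unispeech-sat-base-100h-libri-ft
// has been published under the hypothetical Hub ID used below.
import { pipeline } from '@xenova/transformers';

// CTC-based transcription, decoded the same way as the existing wav2vec2-family models.
const transcriber = await pipeline('automatic-speech-recognition', 'Xenova/unispeech-sat-base-100h-libri-ft'); // assumed model ID

// Any audio URL (or a Float32Array of raw samples) accepted by the pipeline works here.
const output = await transcriber('https://example.com/speech-sample.wav'); // placeholder URL
console.log(output); // e.g. { text: '...' }

Plain `unispeech` ASR checkpoints remain a TODO in scripts/supported_models.py (only feature extraction is listed for now), which is why the sketch uses a UniSpeechSat checkpoint.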