From ff3019fc052a8df575b3e30ff4b0e8e85a76255c Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Wed, 6 Dec 2023 22:17:11 +0200 Subject: [PATCH] Add example usage for `SpeechT5ForSpeechToText` (#438) --- src/models.js | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/src/models.js b/src/models.js index 1c55d88..48fab2f 100644 --- a/src/models.js +++ b/src/models.js @@ -3859,6 +3859,43 @@ export class SpeechT5Model extends SpeechT5PreTrainedModel { }; /** * SpeechT5 Model with a speech encoder and a text decoder. + * + * **Example:** Generate speech from text with `SpeechT5ForTextToSpeech`. + * ```javascript + * import { AutoTokenizer, AutoProcessor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, Tensor } from '@xenova/transformers'; + * + * // Load the tokenizer and processor + * const tokenizer = await AutoTokenizer.from_pretrained('Xenova/speecht5_tts'); + * const processor = await AutoProcessor.from_pretrained('Xenova/speecht5_tts'); + * + * // Load the models + * // NOTE: We use the unquantized versions as they are more accurate + * const model = await SpeechT5ForTextToSpeech.from_pretrained('Xenova/speecht5_tts', { quantized: false }); + * const vocoder = await SpeechT5HifiGan.from_pretrained('Xenova/speecht5_hifigan', { quantized: false }); + * + * // Load speaker embeddings from URL + * const speaker_embeddings_data = new Float32Array( + * await (await fetch('https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/speaker_embeddings.bin')).arrayBuffer() + * ); + * const speaker_embeddings = new Tensor( + * 'float32', + * speaker_embeddings_data, + * [1, speaker_embeddings_data.length] + * ) + * + * // Run tokenization + * const { input_ids } = tokenizer('Hello, my dog is cute'); + * + * // Generate waveform + * const { waveform } = await model.generate_speech(input_ids, speaker_embeddings, { vocoder }); + * console.log(waveform) + * // Tensor { + * // dims: [ 26112 ], + * // type: 'float32', + * // size: 
26112, + * // data: Float32Array(26112) [ -0.00043630177970044315, -0.00018082228780258447, ... ], + * // } + * ``` */ export class SpeechT5ForSpeechToText extends SpeechT5PreTrainedModel { } @@ -3984,6 +4021,8 @@ export class SpeechT5ForTextToSpeech extends SpeechT5PreTrainedModel { /** * HiFi-GAN vocoder. + * + * See [SpeechT5ForSpeechToText](./models#module_models.SpeechT5ForSpeechToText) for example usage. */ export class SpeechT5HifiGan extends PreTrainedModel { main_input_name = 'spectrogram';