# coding=utf-8
# Copyright 2021-2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tests for the SpeechT5 feature extractors."""

import itertools
import random
import unittest

import numpy as np

from transformers import BatchFeature, SpeechT5FeatureExtractor
from transformers.testing_utils import require_torch
from transformers.utils.import_utils import is_torch_available

from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin


if is_torch_available():
    import torch


global_rng = random.Random()


# Copied from tests.models.whisper.test_feature_extraction_whisper.floats_list
def floats_list(shape, scale=1.0, rng=None, name=None):
    """Creates a random float32 tensor"""
    if rng is None:
        rng = global_rng

    values = []
    for batch_idx in range(shape[0]):
        values.append([])
        for _ in range(shape[1]):
            values[-1].append(rng.random() * scale)

    return values


@require_torch
class SpeechT5FeatureExtractionTester(unittest.TestCase):
    def __init__(
        self,
        parent,
        batch_size=7,
        min_seq_length=400,
        max_seq_length=2000,
        feature_size=1,
        padding_value=0.0,
        sampling_rate=16000,
        do_normalize=True,
        num_mel_bins=80,
        hop_length=16,
        win_length=64,
        win_function="hann_window",
        fmin=80,
        fmax=7600,
        mel_floor=1e-10,
        return_attention_mask=True,
    ):
        self.parent = parent
        self.batch_size = batch_size
        self.min_seq_length = min_seq_length
        self.max_seq_length = max_seq_length
        self.seq_length_diff = (self.max_seq_length - self.min_seq_length) // (self.batch_size - 1)
        self.feature_size = feature_size
        self.padding_value = padding_value
        self.sampling_rate = sampling_rate
        self.do_normalize = do_normalize
        self.num_mel_bins = num_mel_bins
        self.hop_length = hop_length
        self.win_length = win_length
        self.win_function = win_function
        self.fmin = fmin
        self.fmax = fmax
        self.mel_floor = mel_floor
        self.return_attention_mask = return_attention_mask

    def prepare_feat_extract_dict(self):
        return {
            "feature_size": self.feature_size,
            "padding_value": self.padding_value,
            "sampling_rate": self.sampling_rate,
            "do_normalize": self.do_normalize,
            "num_mel_bins": self.num_mel_bins,
            "hop_length": self.hop_length,
            "win_length": self.win_length,
            "win_function": self.win_function,
            "fmin": self.fmin,
            "fmax": self.fmax,
            "mel_floor": self.mel_floor,
            "return_attention_mask": self.return_attention_mask,
        }

    def prepare_inputs_for_common(self, equal_length=False, numpify=False):
        def _flatten(list_of_lists):
            return list(itertools.chain(*list_of_lists))

        if equal_length:
            speech_inputs = floats_list((self.batch_size, self.max_seq_length))
        else:
            # make sure that inputs increase in size
            speech_inputs = [
                _flatten(floats_list((x, self.feature_size)))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]

        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]

        return speech_inputs
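
    # Note: `prepare_inputs_for_target` below mirrors this helper, but each example is
    # 2-D with shape (frames, num_mel_bins), i.e. a spectrogram-like target rather than
    # a raw 1-D waveform.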

    def prepare_inputs_for_target(self, equal_length=False, numpify=False):
        if equal_length:
            speech_inputs = [floats_list((self.max_seq_length, self.num_mel_bins)) for _ in range(self.batch_size)]
        else:
            # make sure that inputs increase in size
            speech_inputs = [
                floats_list((x, self.num_mel_bins))
                for x in range(self.min_seq_length, self.max_seq_length, self.seq_length_diff)
            ]

        if numpify:
            speech_inputs = [np.asarray(x) for x in speech_inputs]

        return speech_inputs


@require_torch
class SpeechT5FeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    feature_extraction_class = SpeechT5FeatureExtractor

    def setUp(self):
        self.feat_extract_tester = SpeechT5FeatureExtractionTester(self)

    def _check_zero_mean_unit_variance(self, input_vector):
        self.assertTrue(np.all(np.mean(input_vector, axis=0) < 1e-3))
        self.assertTrue(np.all(np.abs(np.var(input_vector, axis=0) - 1) < 1e-3))

    def test_call(self):
        # Tests that all calls wrap to encode_plus and batch_encode_plus
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test not batched input
        encoded_sequences_1 = feat_extract(speech_inputs[0], return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feat_extract(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feat_extract(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))
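
    # The normalization tests below assume the tester's defaults (do_normalize=True,
    # padding_value=0.0): every un-padded region of input_values should come out roughly
    # zero-mean / unit-variance, while padded positions stay at (approximately) zero.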

    def test_zero_mean_unit_variance_normalization_np(self):
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 1600, None]
        for max_length, padding in zip(max_lengths, paddings):
            processed = feat_extract(speech_inputs, padding=padding, max_length=max_length, return_tensors="np")
            input_values = processed.input_values

            self._check_zero_mean_unit_variance(input_values[0][:800])
            self.assertTrue(input_values[0][800:].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_values[1][:1000])
            self.assertTrue(input_values[1][1000:].sum() < 1e-6)
            self._check_zero_mean_unit_variance(input_values[2][:1200])

    def test_zero_mean_unit_variance_normalization(self):
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        lengths = range(800, 1400, 200)
        speech_inputs = [floats_list((1, x))[0] for x in lengths]

        paddings = ["longest", "max_length", "do_not_pad"]
        max_lengths = [None, 1600, None]

        for max_length, padding in zip(max_lengths, paddings):
            processed = feat_extract(speech_inputs, max_length=max_length, padding=padding)
            input_values = processed.input_values

            self._check_zero_mean_unit_variance(input_values[0][:800])
            self._check_zero_mean_unit_variance(input_values[1][:1000])
            self._check_zero_mean_unit_variance(input_values[2][:1200])

    def test_zero_mean_unit_variance_normalization_trunc_np_max_length(self):
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=1000, padding="max_length", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1])
        self._check_zero_mean_unit_variance(input_values[2])

    def test_zero_mean_unit_variance_normalization_trunc_np_longest(self):
        feat_extract = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=1000, padding="longest", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1, :1000])
        self._check_zero_mean_unit_variance(input_values[2])

        # make sure that if max_length < longest -> then pad to max_length
        self.assertTrue(input_values.shape == (3, 1000))

        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        processed = feat_extract(
            speech_inputs, truncation=True, max_length=2000, padding="longest", return_tensors="np"
        )
        input_values = processed.input_values

        self._check_zero_mean_unit_variance(input_values[0, :800])
        self._check_zero_mean_unit_variance(input_values[1, :1000])
        self._check_zero_mean_unit_variance(input_values[2])

        # make sure that if max_length > longest -> then pad to longest
        self.assertTrue(input_values.shape == (3, 1200))

    def test_double_precision_pad(self):
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        np_speech_inputs = np.random.rand(100).astype(np.float64)
        py_speech_inputs = np_speech_inputs.tolist()

        for inputs in [py_speech_inputs, np_speech_inputs]:
            np_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="np")
            self.assertTrue(np_processed.input_values.dtype == np.float32)
            pt_processed = feature_extractor.pad([{"input_values": inputs}], return_tensors="pt")
            self.assertTrue(pt_processed.input_values.dtype == torch.float32)
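
    # The *_target tests below exercise the "target" side of the extractor: passing
    # `audio_target=...` (or padding pre-computed targets) yields batches of shape
    # (batch, frames, num_mel_bins) rather than raw 1-D waveforms.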

    def test_call_target(self):
        # Tests that all calls wrap to encode_plus and batch_encode_plus
        feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
        # create three inputs of length 800, 1000, and 1200
        speech_inputs = [floats_list((1, x))[0] for x in range(800, 1400, 200)]
        np_speech_inputs = [np.asarray(speech_input) for speech_input in speech_inputs]

        # Test feature size
        input_values = feature_extractor(audio_target=np_speech_inputs, padding=True, return_tensors="np").input_values
        self.assertTrue(input_values.ndim == 3)
        self.assertTrue(input_values.shape[-1] == feature_extractor.num_mel_bins)

        # Test not batched input
        encoded_sequences_1 = feature_extractor(speech_inputs[0], return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs[0], return_tensors="np").input_values
        self.assertTrue(np.allclose(encoded_sequences_1, encoded_sequences_2, atol=1e-3))

        # Test batched
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

        # Test 2-D numpy arrays are batched.
        speech_inputs = [floats_list((1, x))[0] for x in (800, 800, 800)]
        np_speech_inputs = np.asarray(speech_inputs)
        encoded_sequences_1 = feature_extractor(speech_inputs, return_tensors="np").input_values
        encoded_sequences_2 = feature_extractor(np_speech_inputs, return_tensors="np").input_values
        for enc_seq_1, enc_seq_2 in zip(encoded_sequences_1, encoded_sequences_2):
            self.assertTrue(np.allclose(enc_seq_1, enc_seq_2, atol=1e-3))

    def test_batch_feature_target(self):
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        self.assertTrue(all(len(x) == len(y) for x, y in zip(speech_inputs, processed_features[input_name])))

        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="np")

        batch_features_input = processed_features[input_name]

        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertTrue(
            batch_features_input.shape
            == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins)
        )

    @require_torch
    def test_batch_feature_target_pt(self):
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target(equal_length=True)
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs}, tensor_type="pt")

        batch_features_input = processed_features[input_name]

        if len(batch_features_input.shape) < 3:
            batch_features_input = batch_features_input[:, :, None]

        self.assertTrue(
            batch_features_input.shape
            == (self.feat_extract_tester.batch_size, len(speech_inputs[0]), self.feat_extract_tester.num_mel_bins)
        )
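
    # The padding/attention-mask tests below set `feature_size = num_mel_bins` before
    # calling pad() (marked "hack!"), presumably so the generic padding helper treats
    # each target frame as a num_mel_bins-sized feature vector.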

    @require_torch
    def test_padding_accepts_tensors_target_pt(self):
        feat_extract = self.feature_extraction_class(**self.feat_extract_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_name = feat_extract.model_input_names[0]

        processed_features = BatchFeature({input_name: speech_inputs})

        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        input_np = feat_extract.pad(processed_features, padding="longest", return_tensors="np")[input_name]
        input_pt = feat_extract.pad(processed_features, padding="longest", return_tensors="pt")[input_name]

        self.assertTrue(abs(input_np.astype(np.float32).sum() - input_pt.numpy().astype(np.float32).sum()) < 1e-2)

    def test_attention_mask_target(self):
        feat_dict = self.feat_extract_dict
        feat_dict["return_attention_mask"] = True
        feat_extract = self.feature_extraction_class(**feat_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_lengths = [len(x) for x in speech_inputs]
        input_name = feat_extract.model_input_names[0]

        processed = BatchFeature({input_name: speech_inputs})

        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        processed = feat_extract.pad(processed, padding="longest", return_tensors="np")
        self.assertIn("attention_mask", processed)
        self.assertListEqual(list(processed.attention_mask.shape), list(processed[input_name].shape[:2]))
        self.assertListEqual(processed.attention_mask.sum(-1).tolist(), input_lengths)

    def test_attention_mask_with_truncation_target(self):
        feat_dict = self.feat_extract_dict
        feat_dict["return_attention_mask"] = True
        feat_extract = self.feature_extraction_class(**feat_dict)
        speech_inputs = self.feat_extract_tester.prepare_inputs_for_target()
        input_lengths = [len(x) for x in speech_inputs]
        input_name = feat_extract.model_input_names[0]

        processed = BatchFeature({input_name: speech_inputs})
        max_length = min(input_lengths)

        feat_extract.feature_size = feat_extract.num_mel_bins  # hack!

        processed_pad = feat_extract.pad(
            processed, padding="max_length", max_length=max_length, truncation=True, return_tensors="np"
        )
        self.assertIn("attention_mask", processed_pad)
        self.assertListEqual(
            list(processed_pad.attention_mask.shape), [processed_pad[input_name].shape[0], max_length]
        )
        self.assertListEqual(
            processed_pad.attention_mask[:, :max_length].sum(-1).tolist(), [max_length for x in speech_inputs]
        )

    def _load_datasamples(self, num_samples):
        from datasets import load_dataset

        ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        # automatic decoding with librispeech
        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]

        return [x["array"] for x in speech_samples]

    def test_integration(self):
        # fmt: off
        EXPECTED_INPUT_VALUES = torch.tensor(
            [2.3804e-03, 2.0752e-03, 1.9836e-03, 2.1057e-03, 1.6174e-03,
             3.0518e-04, 9.1553e-05, 3.3569e-04, 9.7656e-04, 1.8311e-03,
             2.0142e-03, 2.1057e-03, 1.7395e-03, 4.5776e-04, -3.9673e-04,
             4.5776e-04, 1.0071e-03, 9.1553e-05, 4.8828e-04, 1.1597e-03,
             7.3242e-04, 9.4604e-04, 1.8005e-03, 1.8311e-03, 8.8501e-04,
             4.2725e-04, 4.8828e-04, 7.3242e-04, 1.0986e-03, 2.1057e-03]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = SpeechT5FeatureExtractor()
        input_values = feature_extractor(input_speech, return_tensors="pt").input_values
        self.assertEqual(input_values.shape, (1, 93680))
        self.assertTrue(torch.allclose(input_values[0, :30], EXPECTED_INPUT_VALUES, atol=1e-6))

    def test_integration_target(self):
        # fmt: off
        EXPECTED_INPUT_VALUES = torch.tensor(
            [-2.6870, -3.0104, -3.1356, -3.5352, -3.0044, -3.0353, -3.4719, -3.6777,
             -3.1520, -2.9435, -2.6553, -2.8795, -2.9944, -2.5921, -3.0279, -3.0386,
             -3.0864, -3.1291, -3.2353, -2.7444, -2.6831, -2.7287, -3.1761, -3.1571,
             -3.2726, -3.0582, -3.1007, -3.4533, -3.4695, -3.0998]
        )
        # fmt: on

        input_speech = self._load_datasamples(1)
        feature_extractor = SpeechT5FeatureExtractor()
        input_values = feature_extractor(audio_target=input_speech, return_tensors="pt").input_values
        self.assertEqual(input_values.shape, (1, 366, 80))
        self.assertTrue(torch.allclose(input_values[0, 0, :30], EXPECTED_INPUT_VALUES, atol=1e-4))