# transformers/tests/models/univnet/test_modeling_univnet.py
# Copyright 2023 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
import inspect
import random
import unittest

from datasets import Audio, load_dataset

from transformers import UnivNetConfig, UnivNetFeatureExtractor
from transformers.testing_utils import (
is_torch_available,
require_torch,
require_torch_gpu,
slow,
torch_device,
)

from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
ModelTesterMixin,
floats_tensor,
)

if is_torch_available():
    import torch

    from transformers import UnivNetModel


class UnivNetModelTester:
def __init__(
self,
parent,
batch_size=2,
seq_length=7,
in_channels=8,
hidden_channels=8,
num_mel_bins=20,
kernel_predictor_hidden_channels=8,
seed=0,
is_training=False,
):
self.parent = parent
self.batch_size = batch_size
self.seq_length = seq_length
self.in_channels = in_channels
self.hidden_channels = hidden_channels
self.num_mel_bins = num_mel_bins
self.kernel_predictor_hidden_channels = kernel_predictor_hidden_channels
self.seed = seed
self.is_training = is_training
def prepare_noise_sequence(self):
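        # torch.manual_seed seeds and returns the default CPU generator, so the
        # noise below is reproducible regardless of the device under test.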
generator = torch.manual_seed(self.seed)
noise_shape = (self.batch_size, self.seq_length, self.in_channels)
# Create noise on CPU for reproducibility
noise_sequence = torch.randn(noise_shape, generator=generator, dtype=torch.float)
return noise_sequence
def prepare_config_and_inputs(self):
spectrogram = floats_tensor([self.batch_size, self.seq_length, self.num_mel_bins], scale=1.0)
noise_sequence = self.prepare_noise_sequence()
noise_sequence = noise_sequence.to(spectrogram.device)
config = self.get_config()
return config, spectrogram, noise_sequence
def get_config(self):
return UnivNetConfig(
model_in_channels=self.in_channels,
model_hidden_channels=self.hidden_channels,
num_mel_bins=self.num_mel_bins,
kernel_predictor_hidden_channels=self.kernel_predictor_hidden_channels,
)
def create_and_check_model(self, config, spectrogram, noise_sequence):
model = UnivNetModel(config=config).to(torch_device).eval()
result = model(spectrogram, noise_sequence)[0]
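        # The output waveform should have seq_length * 256 samples: 256 is the
        # model's total upsampling factor (assuming UnivNetConfig's default
        # resblock_stride_sizes of [8, 8, 4], whose product is 256).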
self.parent.assertEqual(result.shape, (self.batch_size, self.seq_length * 256))
def prepare_config_and_inputs_for_common(self):
config, spectrogram, noise_sequence = self.prepare_config_and_inputs()
inputs_dict = {"input_features": spectrogram, "noise_sequence": noise_sequence}
return config, inputs_dict


@require_torch
class UnivNetModelTest(ModelTesterMixin, unittest.TestCase):
all_model_classes = (UnivNetModel,) if is_torch_available() else ()
# UnivNetModel currently cannot be traced with torch.jit.trace.
test_torchscript = False
# The UnivNetModel is not a transformer and does not use any attention mechanisms, so skip transformer/attention
# related tests.
test_pruning = False
test_resize_embeddings = False
test_resize_position_embeddings = False
test_head_masking = False
# UnivNetModel is not a sequence classification model.
test_mismatched_shapes = False
# UnivNetModel does not have a base_model_prefix attribute.
test_missing_keys = False
# UnivNetModel does not implement a parallelize method.
test_model_parallel = False
is_encoder_decoder = False
has_attentions = False
input_name = "input_features"
def setUp(self):
self.model_tester = UnivNetModelTester(self)
self.config_tester = ConfigTester(self, config_class=UnivNetConfig)
@unittest.skip(reason="fix this once it gets more usage")
def test_multi_gpu_data_parallel_forward(self):
super().test_multi_gpu_data_parallel_forward()
def test_config(self):
self.config_tester.create_and_test_config_to_json_string()
self.config_tester.create_and_test_config_to_json_file()
self.config_tester.create_and_test_config_from_and_save_pretrained()
self.config_tester.create_and_test_config_from_and_save_pretrained_subfolder()
self.config_tester.create_and_test_config_with_num_labels()
self.config_tester.check_config_can_be_init_without_params()
self.config_tester.check_config_arguments_init()
def test_model(self):
config_and_inputs = self.model_tester.prepare_config_and_inputs()
self.model_tester.create_and_check_model(*config_and_inputs)
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
signature = inspect.signature(model.forward)
# signature.parameters is an OrderedDict => so arg_names order is deterministic
arg_names = [*signature.parameters.keys()]
expected_arg_names = [
"input_features",
]
self.assertListEqual(arg_names[: len(expected_arg_names)], expected_arg_names)
@unittest.skip(reason="UnivNetModel does not output hidden_states.")
def test_hidden_states_output(self):
pass
@unittest.skip(reason="UnivNetModel.forward does not accept an inputs_embeds argument.")
def test_inputs_embeds(self):
pass
@unittest.skip(reason="UnivNetModel does not use input embeddings and thus has no get_input_embeddings method.")
def test_model_common_attributes(self):
pass
@unittest.skip(reason="UnivNetModel does not support all arguments tested, such as output_hidden_states.")
def test_model_outputs_equivalence(self):
pass
@unittest.skip(reason="UnivNetModel does not output hidden_states.")
def test_retain_grad_hidden_states_attentions(self):
pass
def test_batched_inputs_outputs(self):
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
batched_spectrogram = inputs["input_features"]
batched_noise_sequence = inputs["noise_sequence"]
with torch.no_grad():
batched_outputs = model(
batched_spectrogram.to(torch_device),
batched_noise_sequence.to(torch_device),
)[0]
self.assertEqual(
batched_spectrogram.shape[0],
batched_outputs.shape[0],
msg="Got different batch dims for input and output",
)
def test_unbatched_inputs_outputs(self):
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
with torch.no_grad():
outputs = model(
inputs["input_features"][:1].to(torch_device), inputs["noise_sequence"][:1].to(torch_device)
)[0]
        self.assertEqual(outputs.shape[0], 1, msg="Unbatched input should produce batched output with batch size 1")


@require_torch_gpu
@slow
class UnivNetModelIntegrationTests(unittest.TestCase):
def tearDown(self):
super().tearDown()
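        # Release model references and cached CUDA memory so the slow
        # integration tests do not accumulate GPU allocations between methods.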
gc.collect()
torch.cuda.empty_cache()
def _load_datasamples(self, num_samples, sampling_rate=24000):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
ds = ds.cast_column("audio", Audio(sampling_rate=sampling_rate))
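        # cast_column with Audio(sampling_rate=...) resamples each example to the
        # requested rate lazily, when the "audio" column is accessed.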
# automatic decoding with librispeech
speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
return [x["array"] for x in speech_samples], [x["sampling_rate"] for x in speech_samples]
def get_inputs(self, device, num_samples: int = 3, noise_length: int = 10, seed: int = 0):
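        # Build deterministic noise and spectrogram inputs shaped to match the
        # dg845/univnet-dev checkpoint (64 noise channels, 100 mel bins).
        # When num_samples == 1, the tensors are returned without a batch dim.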
generator = torch.manual_seed(seed)
        # Note: hardcode model_in_channels -> 64 to match the dg845/univnet-dev checkpoint
if num_samples == 1:
noise_sequence_shape = (64, noise_length)
else:
noise_sequence_shape = (num_samples, 64, noise_length)
        # Explicitly generate noise_sequence on CPU for consistency.
noise_sequence = torch.randn(noise_sequence_shape, generator=generator, dtype=torch.float32, device="cpu")
# Put noise_sequence on the desired device.
noise_sequence = noise_sequence.to(device)
        # Note: hardcode num_mel_bins -> 100 to match the dg845/univnet-dev checkpoint
if num_samples == 1:
spectrogram_shape = [100, noise_length]
else:
spectrogram_shape = [num_samples, 100, noise_length]
spectrogram = floats_tensor(spectrogram_shape, scale=1.0, rng=random.Random(seed))
        # Note: floats_tensor already places the spectrogram on torch_device
        # Transpose so the channel dimension comes last, matching the diffusers implementation
if num_samples == 1:
noise_sequence = noise_sequence.transpose(1, 0)
spectrogram = spectrogram.transpose(1, 0)
else:
noise_sequence = noise_sequence.transpose(2, 1)
spectrogram = spectrogram.transpose(2, 1)
inputs = {
"input_features": spectrogram,
"noise_sequence": noise_sequence,
"generator": generator,
}
return inputs
def test_model_inference_batched(self):
# Load sample checkpoint from Tortoise TTS
model = UnivNetModel.from_pretrained("dg845/univnet-dev")
model.eval().to(torch_device)
# Get batched noise and spectrogram inputs.
input_speech = self.get_inputs(torch_device, num_samples=3)
with torch.no_grad():
waveform = model(**input_speech)[0]
waveform = waveform.cpu()
waveform_mean = torch.mean(waveform)
waveform_stddev = torch.std(waveform)
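        # Compare summary statistics and the last 9 samples of the final waveform.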
waveform_slice = waveform[-1, -9:].flatten()
EXPECTED_MEAN = torch.tensor(-0.19989729)
EXPECTED_STDDEV = torch.tensor(0.35230172)
EXPECTED_SLICE = torch.tensor([-0.3408, -0.6045, -0.5052, 0.1160, -0.1556, -0.0405, -0.3024, -0.5290, -0.5019])
torch.testing.assert_close(waveform_mean, EXPECTED_MEAN, atol=1e-4, rtol=1e-5)
torch.testing.assert_close(waveform_stddev, EXPECTED_STDDEV, atol=1e-4, rtol=1e-5)
torch.testing.assert_close(waveform_slice, EXPECTED_SLICE, atol=5e-4, rtol=1e-5)
def test_model_inference_unbatched(self):
# Load sample checkpoint from Tortoise TTS
model = UnivNetModel.from_pretrained("dg845/univnet-dev")
model.eval().to(torch_device)
# Get unbatched noise and spectrogram inputs.
input_speech = self.get_inputs(torch_device, num_samples=1)
with torch.no_grad():
waveform = model(**input_speech)[0]
waveform = waveform.cpu()
waveform_mean = torch.mean(waveform)
waveform_stddev = torch.std(waveform)
waveform_slice = waveform[-1, -9:].flatten()
EXPECTED_MEAN = torch.tensor(-0.22895093)
EXPECTED_STDDEV = torch.tensor(0.33986747)
EXPECTED_SLICE = torch.tensor([-0.3276, -0.5504, -0.3484, 0.3574, -0.0373, -0.1826, -0.4880, -0.6431, -0.5162])
torch.testing.assert_close(waveform_mean, EXPECTED_MEAN, atol=1e-4, rtol=1e-5)
torch.testing.assert_close(waveform_stddev, EXPECTED_STDDEV, atol=1e-4, rtol=1e-5)
torch.testing.assert_close(waveform_slice, EXPECTED_SLICE, atol=1e-3, rtol=1e-5)
def test_integration(self):
feature_extractor = UnivNetFeatureExtractor.from_pretrained("dg845/univnet-dev")
model = UnivNetModel.from_pretrained("dg845/univnet-dev")
model.eval().to(torch_device)
audio, sr = self._load_datasamples(1, sampling_rate=feature_extractor.sampling_rate)
input_features = feature_extractor(audio, sampling_rate=sr[0], return_tensors="pt").input_features
input_features = input_features.to(device=torch_device)
input_speech = self.get_inputs(torch_device, num_samples=1, noise_length=input_features.shape[1])
input_speech["input_features"] = input_features
with torch.no_grad():
waveform = model(**input_speech)[0]
waveform = waveform.cpu()
waveform_mean = torch.mean(waveform)
waveform_stddev = torch.std(waveform)
waveform_slice = waveform[-1, -9:].flatten()
EXPECTED_MEAN = torch.tensor(0.00051374)
EXPECTED_STDDEV = torch.tensor(0.058105603)
# fmt: off
EXPECTED_SLICE = torch.tensor([-4.3934e-04, -1.8203e-04, -3.3033e-04, -3.8716e-04, -1.6125e-04, 3.5389e-06, -3.3149e-04, -3.7613e-04, -2.3331e-04])
# fmt: on
torch.testing.assert_close(waveform_mean, EXPECTED_MEAN, atol=5e-6, rtol=1e-5)
torch.testing.assert_close(waveform_stddev, EXPECTED_STDDEV, atol=1e-4, rtol=1e-5)
torch.testing.assert_close(waveform_slice, EXPECTED_SLICE, atol=5e-6, rtol=1e-5)