# coding=utf-8
# Copyright 2023 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
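"""Tests for the Pop2Piano feature extractor."""
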
import os
import tempfile
import unittest

import numpy as np
from datasets import load_dataset

from transformers.testing_utils import (
check_json_file_has_correct_format,
require_essentia,
require_librosa,
require_scipy,
require_tf,
require_torch,
)
from transformers.utils.import_utils import (
is_essentia_available,
is_librosa_available,
is_scipy_available,
is_torch_available,
)

from ...test_sequence_feature_extraction_common import SequenceFeatureExtractionTestMixin


requirements_available = (
    is_torch_available() and is_essentia_available() and is_scipy_available() and is_librosa_available()
)

if requirements_available:
    import torch

    from transformers import Pop2PianoFeatureExtractor
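

# Builds the config kwargs shared by the feature-extraction tests below.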
class Pop2PianoFeatureExtractionTester(unittest.TestCase):
def __init__(
self,
parent,
n_bars=2,
sample_rate=22050,
use_mel=True,
padding_value=0,
vocab_size_special=4,
vocab_size_note=128,
vocab_size_velocity=2,
vocab_size_time=100,
):
self.parent = parent
self.n_bars = n_bars
self.sample_rate = sample_rate
self.use_mel = use_mel
self.padding_value = padding_value
self.vocab_size_special = vocab_size_special
self.vocab_size_note = vocab_size_note
self.vocab_size_velocity = vocab_size_velocity
self.vocab_size_time = vocab_size_time
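
    # Kwargs used to instantiate Pop2PianoFeatureExtractor in the tests below.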
def prepare_feat_extract_dict(self):
return {
"n_bars": self.n_bars,
"sample_rate": self.sample_rate,
"use_mel": self.use_mel,
"padding_value": self.padding_value,
"vocab_size_special": self.vocab_size_special,
"vocab_size_note": self.vocab_size_note,
"vocab_size_velocity": self.vocab_size_velocity,
"vocab_size_time": self.vocab_size_time,
}
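

# These tests require torch, essentia, librosa and scipy, mirroring the runtime
# requirements of Pop2PianoFeatureExtractor; hence the stacked decorators below.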
@require_torch
@require_essentia
@require_librosa
@require_scipy
class Pop2PianoFeatureExtractionTest(SequenceFeatureExtractionTestMixin, unittest.TestCase):
    feature_extraction_class = Pop2PianoFeatureExtractor if requirements_available else None

def setUp(self):
self.feat_extract_tester = Pop2PianoFeatureExtractionTester(self)
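
    # Round-trip test: save_pretrained followed by from_pretrained should preserve the config.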
def test_feat_extract_from_and_save_pretrained(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
saved_file = feat_extract_first.save_pretrained(tmpdirname)[0]
check_json_file_has_correct_format(saved_file)
feat_extract_second = self.feature_extraction_class.from_pretrained(tmpdirname)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.use_mel
mel_2 = feat_extract_second.use_mel
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
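
    # Same round trip, but through an explicit JSON file via to_json_file/from_json_file.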
def test_feat_extract_to_json_file(self):
feat_extract_first = self.feature_extraction_class(**self.feat_extract_dict)
with tempfile.TemporaryDirectory() as tmpdirname:
json_file_path = os.path.join(tmpdirname, "feat_extract.json")
feat_extract_first.to_json_file(json_file_path)
feat_extract_second = self.feature_extraction_class.from_json_file(json_file_path)
dict_first = feat_extract_first.to_dict()
dict_second = feat_extract_second.to_dict()
mel_1 = feat_extract_first.use_mel
mel_2 = feat_extract_second.use_mel
self.assertTrue(np.allclose(mel_1, mel_2))
self.assertEqual(dict_first, dict_second)
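
    # A single forward call should return 3D input features whose last dimension is 512,
    # plus 2D beatsteps and extrapolated beatsteps.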
def test_call(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_input = np.zeros([1000000], dtype=np.float32)
input_features = feature_extractor(speech_input, sampling_rate=16_000, return_tensors="np")
self.assertTrue(input_features.input_features.ndim == 3)
self.assertEqual(input_features.input_features.shape[-1], 512)
self.assertTrue(input_features.beatsteps.ndim == 2)
self.assertTrue(input_features.extrapolated_beatstep.ndim == 2)
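
    # Integration test: features extracted with the sweetcocoa/pop2piano checkpoint
    # should match the hardcoded reference values.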
def test_integration(self):
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
speech_samples = ds.sort("id").select([0])["audio"]
input_speech = [x["array"] for x in speech_samples][0]
sampling_rate = [x["sampling_rate"] for x in speech_samples][0]
        feature_extractor = Pop2PianoFeatureExtractor.from_pretrained("sweetcocoa/pop2piano")
        input_features = feature_extractor(
input_speech, sampling_rate=sampling_rate, return_tensors="pt"
).input_features
EXPECTED_INPUT_FEATURES = torch.tensor(
[[-7.1493, -6.8701, -4.3214], [-5.9473, -5.7548, -3.8438], [-6.1324, -5.9018, -4.3778]]
)
self.assertTrue(torch.allclose(input_features[0, :3, :3], EXPECTED_INPUT_FEATURES, atol=1e-4))
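
    # Attention masks should be returned for every output and contain only 0s and 1s.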
def test_attention_mask(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_input1 = np.zeros([1_000_000], dtype=np.float32)
speech_input2 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
input_features = feature_extractor(
[speech_input1, speech_input2],
sampling_rate=[44_100, 16_000],
return_tensors="np",
return_attention_mask=True,
)
self.assertTrue(hasattr(input_features, "attention_mask"))
# check shapes
self.assertTrue(input_features["attention_mask"].ndim == 2)
self.assertEqual(input_features["attention_mask_beatsteps"].shape[0], 2)
self.assertEqual(input_features["attention_mask_extrapolated_beatstep"].shape[0], 2)
        # check that the masks contain only 0s and 1s
self.assertTrue(np.max(input_features["attention_mask"]) == 1)
self.assertTrue(np.max(input_features["attention_mask_beatsteps"]) == 1)
self.assertTrue(np.max(input_features["attention_mask_extrapolated_beatstep"]) == 1)
self.assertTrue(np.min(input_features["attention_mask"]) == 0)
self.assertTrue(np.min(input_features["attention_mask_beatsteps"]) == 0)
self.assertTrue(np.min(input_features["attention_mask_extrapolated_beatstep"]) == 0)
def test_batch_feature(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_input1 = np.zeros([1_000_000], dtype=np.float32)
speech_input2 = np.ones([2_000_000], dtype=np.float32)
speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
input_features = feature_extractor(
[speech_input1, speech_input2, speech_input3],
sampling_rate=[44_100, 16_000, 48_000],
return_attention_mask=True,
)
self.assertEqual(len(input_features["input_features"].shape), 3)
# check shape
self.assertEqual(input_features["beatsteps"].shape[0], 3)
self.assertEqual(input_features["extrapolated_beatstep"].shape[0], 3)
def test_batch_feature_np(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_input1 = np.zeros([1_000_000], dtype=np.float32)
speech_input2 = np.ones([2_000_000], dtype=np.float32)
speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
input_features = feature_extractor(
[speech_input1, speech_input2, speech_input3],
sampling_rate=[44_100, 16_000, 48_000],
return_tensors="np",
return_attention_mask=True,
)
        # check that the output is a numpy array
self.assertEqual(type(input_features["input_features"]), np.ndarray)
# check shape
self.assertEqual(len(input_features["input_features"].shape), 3)
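
    # return_tensors="pt" should yield torch tensors.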
def test_batch_feature_pt(self):
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_input1 = np.zeros([1_000_000], dtype=np.float32)
speech_input2 = np.ones([2_000_000], dtype=np.float32)
speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
input_features = feature_extractor(
[speech_input1, speech_input2, speech_input3],
sampling_rate=[44_100, 16_000, 48_000],
return_tensors="pt",
return_attention_mask=True,
)
        # check that the output is a torch tensor
self.assertEqual(type(input_features["input_features"]), torch.Tensor)
# check shape
self.assertEqual(len(input_features["input_features"].shape), 3)
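
    # return_tensors="tf" should yield TensorFlow tensors.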
@require_tf
def test_batch_feature_tf(self):
import tensorflow as tf
feature_extractor = self.feature_extraction_class(**self.feat_extract_tester.prepare_feat_extract_dict())
speech_input1 = np.zeros([1_000_000], dtype=np.float32)
speech_input2 = np.ones([2_000_000], dtype=np.float32)
speech_input3 = np.random.randint(low=0, high=10, size=500_000).astype(np.float32)
input_features = feature_extractor(
[speech_input1, speech_input2, speech_input3],
sampling_rate=[44_100, 16_000, 48_000],
return_tensors="tf",
return_attention_mask=True,
)
        # check that the output is a TensorFlow tensor
self.assertTrue(tf.is_tensor(input_features["input_features"]))
# check shape
self.assertEqual(len(input_features["input_features"].shape), 3)

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
    )
    def test_padding_accepts_tensors_pt(self):
        pass

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
    )
    def test_padding_accepts_tensors_tf(self):
        pass

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
    )
    def test_padding_from_list(self):
        pass

    @unittest.skip(
        "Pop2PianoFeatureExtractor does not support padding externally (while processing audio in batches, padding is automatically applied to max_length)"
    )
    def test_padding_from_array(self):
        pass

    @unittest.skip("Pop2PianoFeatureExtractor does not support truncation")
    def test_attention_mask_with_truncation(self):
        pass

    @unittest.skip("Pop2PianoFeatureExtractor does not support truncation")
    def test_truncation_from_array(self):
        pass

    @unittest.skip("Pop2PianoFeatureExtractor does not support truncation")
    def test_truncation_from_list(self):
        pass