From acc53dd8b155ed12405fe7d4dcebc65a4639c3ee Mon Sep 17 00:00:00 2001 From: Joshua Lochner Date: Tue, 16 May 2023 22:28:38 +0200 Subject: [PATCH] Improve `getMelFilters` function Minor performance +memory improvements (~20%) --- src/utils/audio.js | 93 +++++++++++++++++++++++++--------------------- src/utils/maths.js | 12 +++--- 2 files changed, 58 insertions(+), 47 deletions(-) diff --git a/src/utils/audio.js b/src/utils/audio.js index 5cba3fa..9b06c56 100644 --- a/src/utils/audio.js +++ b/src/utils/audio.js @@ -64,60 +64,69 @@ export async function read_audio(url, sampling_rate) { } /** - * @param {number} sr - The sampling rate. - * @param {number} n_fft - * @param {number} n_mels - * @param {Float32ArrayConstructor | Float64ArrayConstructor} dtype The data type. - * @returns + * Creates a frequency bin conversion matrix used to obtain a mel spectrogram. + * @param {number} sr Sample rate of the audio waveform. + * @param {number} n_fft Number of frequencies used to compute the spectrogram (should be the same as in `stft`). + * @param {number} n_mels Number of mel filters to generate. + * @returns {number[][]} Projection matrix to go from a spectrogram to a mel spectrogram. */ - export function getMelFilters(sr, n_fft, n_mels = 128, dtype = Float32Array) { - // Initialize the weights +export function getMelFilters(sr, n_fft, n_mels = 128) { n_mels = Math.floor(n_mels); - let weights = new Array(n_mels).fill(0).map(() => new dtype(Math.floor(1 + n_fft / 2))); + + // Initialize the weights + const mel_size = Math.floor(1 + n_fft / 2); + const weights = new Array(n_mels); // Center freqs of each FFT bin - let fftfreqs = rfftfreq(n_fft, 1 / sr); + const fftfreqs = rfftfreq(n_fft, 1 / sr); // 'Center freqs' of mel bands - uniformly spaced between limits - let min_mel = 0.0; - let max_mel = 45.245640471924965; - - let mels = Array.from({ length: n_mels + 2 }, (v, k) => k * ((max_mel - min_mel) / (n_mels + 1)) + min_mel); + const min_mel = 0.0; + const max_mel = 45.245640471924965; + const mel_range = max_mel - min_mel; + const mel_scale = mel_range / (n_mels + 1); // Fill in the linear scale - let f_min = 0.0; - let f_sp = 200.0 / 3; - let freqs = mels.map(v => f_min + f_sp * v); + const f_min = 0.0; + const f_sp = 200.0 / 3; + const freqs = new Array(n_mels + 2); // And now the nonlinear scale - let min_log_hz = 1000.0; // beginning of log region (Hz) - let min_log_mel = (min_log_hz - f_min) / f_sp; // same (Mels) - let logstep = Math.log(6.4) / 27.0; // step size for log region + const min_log_hz = 1000.0; // beginning of log region (Hz) + const min_log_mel = (min_log_hz - f_min) / f_sp; // same (Mels) + const logstep = Math.log(6.4) / 27.0; // step size for log region - // If we have vector data, vectorize - let log_t = mels.map(v => v >= min_log_mel); - freqs.forEach((v, i) => { - if (log_t[i]) { - freqs[i] = min_log_hz * Math.exp(logstep * (mels[i] - min_log_mel)); - } - }) - - let mel_f = freqs; - - let fdiff = mel_f.slice(1).map((v, i) => v - mel_f[i]); - let ramps = mel_f.map(v => fftfreqs.map(k => v - k)); - - for (let i = 0; i < n_mels; i++) { - // lower and upper slopes for all bins - let lower = ramps[i].map(v => -v / fdiff[i]); - let upper = ramps[i + 2].map(v => v / fdiff[i + 1]); - - // .. then intersect them with each other and zero - weights[i] = lower.map((v, j) => Math.max(0, Math.min(v, upper[j]))); + const ramps = new Array(freqs.length); + for (let i = 0; i < freqs.length; ++i) { + const mel = i * mel_scale + min_mel; + if (mel >= min_log_mel) { + freqs[i] = min_log_hz * Math.exp(logstep * (mel - min_log_mel)); + } else { + freqs[i] = f_min + f_sp * mel; + } + ramps[i] = fftfreqs.map(k => freqs[i] - k); + } + + const fdiffinv = freqs.slice(1).map((v, i) => 1 / (v - freqs[i])); + + for (let i = 0; i < weights.length; ++i) { + weights[i] = new Array(mel_size); + + const a = fdiffinv[i]; + const b = fdiffinv[i + 1]; + const c = ramps[i]; + const d = ramps[i + 2]; + + // Slaney-style mel is scaled to be approx constant energy per channel + const enorm = 2.0 / (freqs[i + 2] - freqs[i]); + + for (let j = 0; j < weights[i].length; ++j) { + // lower and upper slopes for all bins + const lower = -c[j] * a; + const upper = d[j] * b; + weights[i][j] = Math.max(0, Math.min(lower, upper)) * enorm; + } } - // Slaney-style mel is scaled to be approx constant energy per channel - let enorm = mel_f.slice(2, n_mels + 2).map((v, i) => 2.0 / (v - mel_f[i])); - weights = weights.map((v, i) => v.map(k => k * enorm[i])); return weights; } diff --git a/src/utils/maths.js b/src/utils/maths.js index bdc785f..869d18c 100644 --- a/src/utils/maths.js +++ b/src/utils/maths.js @@ -270,24 +270,26 @@ export function max(arr) { } /** + * Return the Discrete Fourier Transform sample frequencies. + * * Code adapted from https://github.com/numpy/numpy/blob/25908cacd19915bf3ddd659c28be28a41bd97a54/numpy/fft/helper.py#L173-L221 - * Original Python doc: Original Python doc: https://numpy.org/doc/stable/reference/generated/numpy.fft.rfftfreq.html + * Original Python doc: https://numpy.org/doc/stable/reference/generated/numpy.fft.rfftfreq.html * @example * rfftfreq(400, 1 / 16000) // (201) [0, 40, 80, 120, 160, 200, ..., 8000] * @param {number} n Window length * @param {number} [d = 1.0] Sample spacing (inverse of the sampling rate). Defaults to 1. - * @throws {TypeError} + * @throws {TypeError} If n is not an integer. * @returns {number[]} Array of length `Math.floor(n / 2) + 1;` containing the sample frequencies. */ - export function rfftfreq(n, d = 1.0) { +export function rfftfreq(n, d = 1.0) { if (!Number.isInteger(n)) { throw new TypeError(`n should be an integer, but ${n} given.`); } const val = 1.0 / (n * d); const len = Math.floor(n / 2) + 1; const results = new Array(len); - for (let i = 0; i < len; i++) { - results[i] = i * val; + for (let i = 0; i < len; ++i) { + results[i] = i * val; } return results; }