hammerspoon/extensions/noises/detectors.cpp

271 lines
8.3 KiB
C++

#include "detectors.h"
#include <iostream>
#include <numeric>
#include <algorithm>
#include <cmath>
#include <limits>
using namespace std;
#include "popTemplate.h"
static const bool kDelayMatch = false;
static const int kBlockSize = DETECTORS_BLOCK_SIZE;
static const int kLogBlockSize = 9;
static const int kSpectrumSize = kBlockSize/2;
static const int kWindowSize = kBlockSize;
static const int kNumSteps = 4;
static const int kStepSize = kBlockSize / kNumSteps;
static const size_t kMainBandLow = 40;
static const size_t kMainBandHi = 100;
static const size_t kOptionalBandHi = 180;
static const size_t kLowerBandLow = 3;
static const size_t kLowerBandHi = kMainBandLow;
static const size_t kUpperBandLo = kOptionalBandHi;
static const size_t kUpperBandHi = kSpectrumSize;
static const float kDefaultLowPassWeight = 0.6;
static const int kSpeechShadowTime = 100;
static const float kSpeechThresh = 0.5;
// Allocates the sample/FFT buffers, sets the detector tuning parameters and
// creates the vDSP FFT setup. Per-run state is set separately by initialise().
Detectors::Detectors() {
  // Value-initialise (zero) the overlap buffer: the first process() call
  // analyses windows that reach into the "previous block" half before any
  // real audio has been stored there, so it must not hold indeterminate data.
  m_overlapBuffer = new float[kBlockSize * 2]();

  // === Tss Detection
  m_sensitivity = 5.0;
  m_hysterisisFactor = 0.4;
  m_minFrames = 20;
  m_minFramesLong = 100;
  m_lowPassWeight = kDefaultLowPassWeight;

  // === Pop detection
  m_startBin = 2;
  m_maxShiftDown = 4;
  m_maxShiftUp = 2;
  m_popSensitivity = 8.5;
  m_framesSincePop = 0;
  // debugLog = new std::ofstream("/Users/tristan/misc/popclick.log");

  // === FFT
  // Scratch buffers are zeroed as well so no code path can observe
  // uninitialised floats.
  m_inReal = new float[kBlockSize]();
  m_splitData.realp = new float[kSpectrumSize]();
  m_splitData.imagp = new float[kSpectrumSize]();
  m_window = new float[kWindowSize]();
  vDSP_hann_window(m_window, kWindowSize, vDSP_HANN_NORM);
  // NOTE(review): the result of vDSP_create_fftsetup is not checked here.
  m_fftSetup = vDSP_create_fftsetup(kLogBlockSize, FFT_RADIX2);
}
// Releases everything the constructor acquired: the vDSP FFT setup and the
// five heap arrays.
Detectors::~Detectors() {
  vDSP_destroy_fftsetup(m_fftSetup);

  // FFT scratch storage
  delete[] m_window;
  delete[] m_splitData.imagp;
  delete[] m_splitData.realp;
  delete[] m_inReal;

  // sample history used by process()
  delete[] m_overlapBuffer;
  // delete debugLog;
}
// Resets all per-run detection state to its starting values.
//
// Returns true (there is currently no failure path).
bool Detectors::initialise() {
  m_savedOtherBands = 0.0002;
  m_consecutiveMatches = 0;
  // Start the timers well past their thresholds so the first frames are not
  // treated as being inside a speech shadow or right after a match.
  m_framesSinceSpeech = 1000;
  m_framesSinceMatch = 1000;

  // assign() rather than resize(): resize() only fills newly added elements,
  // so calling initialise() a second time would keep stale filter/spectrum
  // values while the pop buffer was being cleared.
  m_lowPassBuffer.assign(static_cast<size_t>(kSpectrumSize), 0.0f);
  m_spectrum.assign(static_cast<size_t>(kSpectrumSize), 0.0f);
  m_popBuffer.assign(static_cast<size_t>(kBufferSize), 0.0f);
  return true;
}
// Feeds one block of kBlockSize samples through the detectors.
//
// The overlap buffer holds the previous block followed by the new one;
// kNumSteps windows of kBlockSize samples, each advanced by kStepSize, are
// analysed and their result flags OR-ed together.
//
// buffer: must point to at least kBlockSize readable floats.
// Returns the bitwise OR of the codes produced by processChunk().
int Detectors::process(const float *buffer) {
  float *const previousBlock = m_overlapBuffer;
  float *const currentBlock = m_overlapBuffer + kBlockSize;

  // last call's input becomes the "previous" half ...
  std::copy(currentBlock, currentBlock + kBlockSize, previousBlock);
  // ... and the fresh samples take its place
  std::copy(buffer, buffer + kBlockSize, currentBlock);

  int flags = 0;
  for (int step = 1; step <= kNumSteps; ++step) {
    float *window = m_overlapBuffer + (step * kStepSize);
    flags |= processChunk(window);
  }
  return flags;
}
void Detectors::doFFT(const float *buffer) {
vDSP_vmul(buffer, 1, m_window, 1, m_inReal, 1, kBlockSize);
vDSP_ctoz(reinterpret_cast<DSPComplex*>(m_inReal), 2, &m_splitData, 1, kSpectrumSize);
vDSP_fft_zrip(m_fftSetup, &m_splitData, 1, kLogBlockSize, FFT_FORWARD);
m_splitData.imagp[0] = 0.0f;
float scale = 1.0f / static_cast<float>(2 * kBlockSize);
vDSP_vsmul(m_splitData.realp, 1, &scale, m_splitData.realp, 1, kSpectrumSize);
vDSP_vsmul(m_splitData.imagp, 1, &scale, m_splitData.imagp, 1, kSpectrumSize);
}
int Detectors::processChunk(const float *buffer) {
doFFT(buffer);
int result = 0;
size_t n = kSpectrumSize;
for (size_t i = 0; i < n; ++i) {
float real = m_splitData.realp[i];
float imag = m_splitData.imagp[i];
float newVal = real * real + imag * imag;
m_spectrum[i] = newVal;
m_lowPassBuffer[i] = m_lowPassBuffer[i]*(1.0f-m_lowPassWeight) + newVal*m_lowPassWeight;
// infinite values happen non-deterministically, probably due to glitchy audio input at start of recording
// but inifinities it could mess up things forever
if(m_lowPassBuffer[i] >= numeric_limits<float>::infinity()) {
std::fill(m_lowPassBuffer.begin(), m_lowPassBuffer.end(), 0.0f);
return 0; // discard the frame, it's probably garbage
}
}
float lowerBand = avgBand(m_lowPassBuffer, kLowerBandLow, kLowerBandHi);
float mainBand = avgBand(m_lowPassBuffer, kMainBandLow, kMainBandHi);
float upperBand = avgBand(m_lowPassBuffer, kUpperBandLo, kUpperBandHi);
m_framesSinceSpeech += 1;
if(lowerBand > kSpeechThresh) {
m_framesSinceSpeech = 0;
}
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
float debugMarker = 0.0002;
#pragma clang diagnostic pop
float matchiness = mainBand / ((lowerBand+upperBand)/2.0f);
bool outOfShadow = m_framesSinceSpeech > kSpeechShadowTime;
int immediateMatchFrame = kDelayMatch ? m_minFramesLong : m_minFrames;
m_framesSinceMatch += 1;
if(((matchiness >= m_sensitivity) ||
(m_consecutiveMatches > 0 && matchiness >= m_sensitivity*m_hysterisisFactor) ||
(m_consecutiveMatches > immediateMatchFrame && (mainBand/m_savedOtherBands) >= m_sensitivity*m_hysterisisFactor*0.5f))
&& outOfShadow) {
debugMarker = 0.01;
// second one in double "tss" came earlier than trigger timer
if(kDelayMatch && m_consecutiveMatches == 0 && m_framesSinceMatch <= m_minFramesLong) {
result |= TSS_START_CODE;
result |= TSS_STOP_CODE;
m_framesSinceMatch = 1000;
}
m_consecutiveMatches += 1;
if(kDelayMatch && m_consecutiveMatches == m_minFrames) {
m_framesSinceMatch = m_consecutiveMatches;
} else if(m_consecutiveMatches == immediateMatchFrame) {
debugMarker = 1.0;
result |= TSS_START_CODE;
m_savedOtherBands = ((lowerBand+upperBand)/2.0f);
}
} else {
bool delayedMatch = kDelayMatch && (m_framesSinceMatch == m_minFramesLong && outOfShadow);
if(delayedMatch) {
result |= TSS_START_CODE;
}
if(m_consecutiveMatches >= immediateMatchFrame || delayedMatch) {
debugMarker = 2.0;
result |= TSS_STOP_CODE;
}
m_consecutiveMatches = 0;
}
// ===================== Pop Detection =================================
// update buffer forward one time step
for(unsigned i = 0; i < kBufferPrimaryHeight; ++i) {
m_popBuffer.pop_front();
m_popBuffer.push_back(m_spectrum[i]);
}
// high frequencies aren't useful so we bin them all together
m_popBuffer.pop_front();
float highSum = accumulate(m_spectrum.begin()+kBufferPrimaryHeight,m_spectrum.end(),0.0);
m_popBuffer.push_back(highSum);
std::deque<float>::iterator maxIt = max_element(m_popBuffer.begin(), m_popBuffer.end());
float minDiff = 10000000.0;
for(int i = -m_maxShiftUp; i < m_maxShiftDown; ++i) {
float diff = templateDiff(*maxIt, i);
if(diff < minDiff) minDiff = diff;
}
m_framesSincePop += 1;
if(minDiff < m_popSensitivity && m_framesSincePop > 15) {
result |= POP_CODE; // Detected pop
m_framesSincePop = 0;
}
// *debugLog << lowerBand << ' ' << mainBand << ' ' << optionalBand << ' ' << upperBand << '-' << matchiness << ' ' << debugMarker << std::endl;
return result;
}
// Returns the arithmetic mean of frame[low] .. frame[hi-1].
//
// The caller must pass low < hi with hi no larger than frame.size(); neither
// is checked here.
float Detectors::avgBand(std::vector<float> &frame, size_t low, size_t hi) {
  float total = 0;
  size_t bin = low;
  while (bin < hi) {
    total += frame[bin];
    ++bin;
  }
  const size_t width = hi - low;
  return total / width;
}
// Looks up the normalised pop-template value for flat index `i`, with the
// template moved by `shift` bins.
//
// The row inside a column is i % kBufferHeight. Rows at or beyond
// kBufferPrimaryHeight are read unshifted; for the other rows, a shifted row
// falling outside [0, kBufferPrimaryHeight) yields 0.
float Detectors::templateAt(int i, int shift) {
  const bool unshiftedRow = i % kBufferHeight >= kBufferPrimaryHeight;
  if (unshiftedRow) {
    return kPopTemplate[i]/kPopTemplateMax;
  }

  const int row = i % kBufferHeight;
  const int shiftedRow = row+shift;
  const bool insidePrimary = !(shiftedRow < 0 || shiftedRow >= kBufferPrimaryHeight);
  if (!insidePrimary) {
    return 0.0;
  }
  return kPopTemplate[i+shift]/kPopTemplateMax;
}
// Sums the absolute differences between one template column and one column
// of the pop buffer, for rows m_startBin up to kBufferHeight-1.
//
// templStart / bufStart: flat index of the column's first element in the
//                        template and in m_popBuffer respectively.
// maxVal: value the buffer entries are divided by before comparison.
// shift:  bin shift forwarded to templateAt().
float Detectors::diffCol(int templStart, int bufStart, float maxVal, int shift) {
  float total = 0;
  unsigned row = m_startBin;
  while (row < kBufferHeight) {
    const float expected = templateAt(templStart+row, shift);
    const float observed = m_popBuffer[bufStart+row]/maxVal;
    total += std::abs(expected - observed);
    ++row;
  }
  return total;
}
float Detectors::templateDiff(float maxVal, int shift) {
float diff = 0;
for(unsigned i = 0; i < kBufferSize; i += kBufferHeight) {
diff += diffCol(i,i, maxVal,shift);
}
return diff;
}
extern "C" {
// C entry point: creates and initialises a detector instance.
//
// Returns an opaque handle to pass to detectors_process()/detectors_free(),
// or a null pointer if initialisation reports failure.
detectors_t *detectors_new() {
  Detectors *dets = new Detectors();
  // Do not hand out a detector whose initialisation failed.
  if (!dets->initialise()) {
    delete dets;
    return nullptr;
  }
  return reinterpret_cast<detectors_t*>(dets);
}
// C entry point: destroys a detector created by detectors_new().
void detectors_free(detectors_t *detectors) {
  delete reinterpret_cast<Detectors*>(detectors);
}
// C entry point: forwards one block of samples to Detectors::process() and
// returns its result flags.
int detectors_process(detectors_t *detectors, const float *buffer) {
  return reinterpret_cast<Detectors*>(detectors)->process(buffer);
}
}