271 lines
8.3 KiB
C++
271 lines
8.3 KiB
C++
#include "detectors.h"
|
|
|
|
#include <iostream>
|
|
#include <numeric>
|
|
#include <algorithm>
|
|
#include <cmath>
|
|
#include <limits>
|
|
|
|
using namespace std;
|
|
|
|
#include "popTemplate.h"
|
|
|
|
static const bool kDelayMatch = false;
|
|
|
|
static const int kBlockSize = DETECTORS_BLOCK_SIZE;
|
|
static const int kLogBlockSize = 9;
|
|
static const int kSpectrumSize = kBlockSize/2;
|
|
static const int kWindowSize = kBlockSize;
|
|
|
|
static const int kNumSteps = 4;
|
|
static const int kStepSize = kBlockSize / kNumSteps;
|
|
|
|
static const size_t kMainBandLow = 40;
|
|
static const size_t kMainBandHi = 100;
|
|
static const size_t kOptionalBandHi = 180;
|
|
|
|
static const size_t kLowerBandLow = 3;
|
|
static const size_t kLowerBandHi = kMainBandLow;
|
|
static const size_t kUpperBandLo = kOptionalBandHi;
|
|
static const size_t kUpperBandHi = kSpectrumSize;
|
|
|
|
static const float kDefaultLowPassWeight = 0.6;
|
|
static const int kSpeechShadowTime = 100;
|
|
static const float kSpeechThresh = 0.5;
|
|
|
|
/**
 * Allocates the working buffers, sets the default tuning for the "tss" and
 * pop detectors, and creates the vDSP FFT setup.
 *
 * All sample buffers are value-initialised (zeroed). This matters for
 * m_overlapBuffer: process() analyses chunks that straddle the previous and
 * the current input block, so on the very first call the "previous block"
 * half is read before anything was ever written to it. Left uninitialised,
 * those floats are indeterminate and can feed NaN/Inf into the FFT.
 *
 * Detection state that initialise() owns (match counters, low-pass buffer,
 * pop buffer, ...) is not touched here; call initialise() before process().
 */
Detectors::Detectors() {
    // Two blocks long: [previous block | current block]; starts out silent.
    m_overlapBuffer = new float[kBlockSize * 2]();

    // === Tss Detection
    m_sensitivity = 5.0;
    m_hysterisisFactor = 0.4;
    m_minFrames = 20;
    m_minFramesLong = 100;
    m_lowPassWeight = kDefaultLowPassWeight;

    // === Pop detection
    m_startBin = 2;
    m_maxShiftDown = 4;
    m_maxShiftUp = 2;
    m_popSensitivity = 8.5;
    m_framesSincePop = 0;

    // debugLog = new std::ofstream("/Users/tristan/misc/popclick.log");

    // === FFT
    m_inReal = new float[kBlockSize]();
    m_splitData.realp = new float[kSpectrumSize]();
    m_splitData.imagp = new float[kSpectrumSize]();
    // Zeroed by the value-initialisation, then filled with the Hann window.
    m_window = new float[kWindowSize]();
    vDSP_hann_window(m_window, kWindowSize, vDSP_HANN_NORM);

    // NOTE(review): the result of vDSP_create_fftsetup is not checked here;
    // confirm whether a failed setup needs handling for this target.
    m_fftSetup = vDSP_create_fftsetup(kLogBlockSize, FFT_RADIX2);
}
|
|
|
|
/**
 * Releases everything the constructor acquired: the vDSP FFT setup and the
 * five heap arrays owned by this object.
 */
Detectors::~Detectors() {
    vDSP_destroy_fftsetup(m_fftSetup);

    // FFT scratch storage.
    delete[] m_window;
    delete[] m_splitData.imagp;
    delete[] m_splitData.realp;
    delete[] m_inReal;

    // Two-block input history used by process().
    delete[] m_overlapBuffer;

    // delete debugLog;
}
|
|
|
|
bool Detectors::initialise() {
|
|
// Real initialisation work goes here!
|
|
m_savedOtherBands = 0.0002;
|
|
m_consecutiveMatches = 0;
|
|
m_framesSinceSpeech = 1000;
|
|
m_framesSinceMatch = 1000;
|
|
m_lowPassBuffer.resize(kSpectrumSize, 0.0);
|
|
|
|
m_spectrum.resize(kSpectrumSize, 0.0);
|
|
m_popBuffer.clear();
|
|
for(unsigned i = 0; i < kBufferSize; ++i) {
|
|
m_popBuffer.push_back(0.0);
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
int Detectors::process(const float *buffer) {
|
|
// return processChunk(buffer);
|
|
// copy last frame to start of the buffer
|
|
std::copy(m_overlapBuffer+kBlockSize, m_overlapBuffer+(kBlockSize*2), m_overlapBuffer);
|
|
// copy new input to the second half of the overlap buffer
|
|
std::copy(buffer,buffer+kBlockSize,m_overlapBuffer+kBlockSize);
|
|
|
|
int result = 0;
|
|
for(int i = 0; i < kNumSteps; ++i) {
|
|
float *ptr = m_overlapBuffer+((i+1)*kStepSize);
|
|
result |= processChunk(ptr);
|
|
}
|
|
return result;
|
|
}
|
|
|
|
void Detectors::doFFT(const float *buffer) {
|
|
vDSP_vmul(buffer, 1, m_window, 1, m_inReal, 1, kBlockSize);
|
|
vDSP_ctoz(reinterpret_cast<DSPComplex*>(m_inReal), 2, &m_splitData, 1, kSpectrumSize);
|
|
vDSP_fft_zrip(m_fftSetup, &m_splitData, 1, kLogBlockSize, FFT_FORWARD);
|
|
m_splitData.imagp[0] = 0.0f;
|
|
|
|
float scale = 1.0f / static_cast<float>(2 * kBlockSize);
|
|
vDSP_vsmul(m_splitData.realp, 1, &scale, m_splitData.realp, 1, kSpectrumSize);
|
|
vDSP_vsmul(m_splitData.imagp, 1, &scale, m_splitData.imagp, 1, kSpectrumSize);
|
|
}
|
|
|
|
int Detectors::processChunk(const float *buffer) {
|
|
doFFT(buffer);
|
|
|
|
int result = 0;
|
|
size_t n = kSpectrumSize;
|
|
|
|
for (size_t i = 0; i < n; ++i) {
|
|
float real = m_splitData.realp[i];
|
|
float imag = m_splitData.imagp[i];
|
|
float newVal = real * real + imag * imag;
|
|
m_spectrum[i] = newVal;
|
|
m_lowPassBuffer[i] = m_lowPassBuffer[i]*(1.0f-m_lowPassWeight) + newVal*m_lowPassWeight;
|
|
|
|
// infinite values happen non-deterministically, probably due to glitchy audio input at start of recording
|
|
// but inifinities it could mess up things forever
|
|
if(m_lowPassBuffer[i] >= numeric_limits<float>::infinity()) {
|
|
std::fill(m_lowPassBuffer.begin(), m_lowPassBuffer.end(), 0.0f);
|
|
return 0; // discard the frame, it's probably garbage
|
|
}
|
|
}
|
|
|
|
float lowerBand = avgBand(m_lowPassBuffer, kLowerBandLow, kLowerBandHi);
|
|
float mainBand = avgBand(m_lowPassBuffer, kMainBandLow, kMainBandHi);
|
|
float upperBand = avgBand(m_lowPassBuffer, kUpperBandLo, kUpperBandHi);
|
|
|
|
m_framesSinceSpeech += 1;
|
|
if(lowerBand > kSpeechThresh) {
|
|
m_framesSinceSpeech = 0;
|
|
}
|
|
|
|
#pragma clang diagnostic push
|
|
#pragma clang diagnostic ignored "-Wunused-but-set-variable"
|
|
float debugMarker = 0.0002;
|
|
#pragma clang diagnostic pop
|
|
float matchiness = mainBand / ((lowerBand+upperBand)/2.0f);
|
|
bool outOfShadow = m_framesSinceSpeech > kSpeechShadowTime;
|
|
int immediateMatchFrame = kDelayMatch ? m_minFramesLong : m_minFrames;
|
|
m_framesSinceMatch += 1;
|
|
if(((matchiness >= m_sensitivity) ||
|
|
(m_consecutiveMatches > 0 && matchiness >= m_sensitivity*m_hysterisisFactor) ||
|
|
(m_consecutiveMatches > immediateMatchFrame && (mainBand/m_savedOtherBands) >= m_sensitivity*m_hysterisisFactor*0.5f))
|
|
&& outOfShadow) {
|
|
debugMarker = 0.01;
|
|
// second one in double "tss" came earlier than trigger timer
|
|
if(kDelayMatch && m_consecutiveMatches == 0 && m_framesSinceMatch <= m_minFramesLong) {
|
|
result |= TSS_START_CODE;
|
|
result |= TSS_STOP_CODE;
|
|
m_framesSinceMatch = 1000;
|
|
}
|
|
|
|
m_consecutiveMatches += 1;
|
|
if(kDelayMatch && m_consecutiveMatches == m_minFrames) {
|
|
m_framesSinceMatch = m_consecutiveMatches;
|
|
} else if(m_consecutiveMatches == immediateMatchFrame) {
|
|
debugMarker = 1.0;
|
|
result |= TSS_START_CODE;
|
|
m_savedOtherBands = ((lowerBand+upperBand)/2.0f);
|
|
}
|
|
} else {
|
|
bool delayedMatch = kDelayMatch && (m_framesSinceMatch == m_minFramesLong && outOfShadow);
|
|
if(delayedMatch) {
|
|
result |= TSS_START_CODE;
|
|
}
|
|
if(m_consecutiveMatches >= immediateMatchFrame || delayedMatch) {
|
|
debugMarker = 2.0;
|
|
result |= TSS_STOP_CODE;
|
|
}
|
|
m_consecutiveMatches = 0;
|
|
}
|
|
|
|
// ===================== Pop Detection =================================
|
|
// update buffer forward one time step
|
|
for(unsigned i = 0; i < kBufferPrimaryHeight; ++i) {
|
|
m_popBuffer.pop_front();
|
|
m_popBuffer.push_back(m_spectrum[i]);
|
|
}
|
|
// high frequencies aren't useful so we bin them all together
|
|
m_popBuffer.pop_front();
|
|
float highSum = accumulate(m_spectrum.begin()+kBufferPrimaryHeight,m_spectrum.end(),0.0);
|
|
m_popBuffer.push_back(highSum);
|
|
|
|
std::deque<float>::iterator maxIt = max_element(m_popBuffer.begin(), m_popBuffer.end());
|
|
float minDiff = 10000000.0;
|
|
for(int i = -m_maxShiftUp; i < m_maxShiftDown; ++i) {
|
|
float diff = templateDiff(*maxIt, i);
|
|
if(diff < minDiff) minDiff = diff;
|
|
}
|
|
|
|
m_framesSincePop += 1;
|
|
if(minDiff < m_popSensitivity && m_framesSincePop > 15) {
|
|
result |= POP_CODE; // Detected pop
|
|
m_framesSincePop = 0;
|
|
}
|
|
|
|
// *debugLog << lowerBand << ' ' << mainBand << ' ' << optionalBand << ' ' << upperBand << '-' << matchiness << ' ' << debugMarker << std::endl;
|
|
return result;
|
|
}
|
|
|
|
/**
 * Mean of frame[low] .. frame[hi - 1].
 *
 * @param frame  values to average (not modified).
 * @param low    first index of the band (inclusive).
 * @param hi     end index of the band (exclusive); clamped to frame.size()
 *               so an oversized band cannot read past the vector.
 * @return the arithmetic mean of the band, or 0 when the (clamped) band is
 *         empty -- this avoids a zero or wrapped-around unsigned divisor.
 */
float Detectors::avgBand(std::vector<float> &frame, size_t low, size_t hi) {
    hi = std::min<size_t>(hi, frame.size());
    if (low >= hi) {
        return 0.0f;
    }

    float sum = 0;
    for (size_t i = low; i < hi; ++i) {
        sum += frame[i];
    }
    return sum / (hi - low);
}
|
|
|
|
float Detectors::templateAt(int i, int shift) {
|
|
int bin = i % kBufferHeight;
|
|
if(i % kBufferHeight >= kBufferPrimaryHeight) {
|
|
return kPopTemplate[i]/kPopTemplateMax;
|
|
}
|
|
if(bin+shift < 0 || bin+shift >= kBufferPrimaryHeight) {
|
|
return 0.0;
|
|
}
|
|
return kPopTemplate[i+shift]/kPopTemplateMax;
|
|
}
|
|
|
|
float Detectors::diffCol(int templStart, int bufStart, float maxVal, int shift) {
|
|
float diff = 0;
|
|
for(unsigned i = m_startBin; i < kBufferHeight; ++i) {
|
|
float d = templateAt(templStart+i, shift) - m_popBuffer[bufStart+i]/maxVal;
|
|
diff += abs(d);
|
|
}
|
|
return diff;
|
|
}
|
|
|
|
float Detectors::templateDiff(float maxVal, int shift) {
|
|
float diff = 0;
|
|
for(unsigned i = 0; i < kBufferSize; i += kBufferHeight) {
|
|
diff += diffCol(i,i, maxVal,shift);
|
|
}
|
|
return diff;
|
|
}
|
|
|
|
extern "C" {
|
|
// Creates a Detectors instance, runs initialise() on it and returns it as an
// opaque handle. The handle must be released with detectors_free().
detectors_t *detectors_new() {
    Detectors *instance = new Detectors();
    instance->initialise();
    detectors_t *handle = reinterpret_cast<detectors_t*>(instance);
    return handle;
}
|
|
// Destroys the Detectors instance behind a handle obtained from
// detectors_new().
void detectors_free(detectors_t *detectors) {
    delete reinterpret_cast<Detectors*>(detectors);
}
|
|
// Forwards one input block to Detectors::process() on the instance behind
// the handle and returns its result code unchanged.
int detectors_process(detectors_t *detectors, const float *buffer) {
    Detectors *instance = reinterpret_cast<Detectors*>(detectors);
    const int flags = instance->process(buffer);
    return flags;
}
|
|
}
|
|
|