essentia.js
PitchContoursMultiMelody
Hello,
I'm trying to use the PitchContoursMultiMelody algorithm to implement real-time multi-pitch detection. I'm following the recommended processing chain mentioned in the description of the PitchContours algorithm, which seems to work fine, but after I pass its outputs to PitchContoursMultiMelody the audio worklet seems to freeze, either immediately or after a couple of seconds (no errors are thrown). I'm unsure why this happens and would appreciate any help.
Note that I do explicitly delete the C++ objects every time the worklet's process() runs, as suggested by the Emscripten documentation (in case this is a memory issue), but it doesn't seem to make any difference (I don't have much experience with Emscripten, so I might have got that wrong; a leak-safe variant of this cleanup is sketched after the code).
See my code below:
import { EssentiaModule } from '/essentia-wasm.module.js'
import Essentia from '/essentia.js-core.es.js'
class AnalyserProcessor extends AudioWorkletProcessor {
  constructor() {
    super()
    this.essentia = new Essentia(EssentiaModule)
    this.initFrames()
    this.initAggregate()
  }

  // 16 overlapping buffers of 2048 samples; each 128-sample render quantum
  // fills one slot per buffer, so a full 2048-sample frame completes every
  // 128 samples (frame size 2048, hop size 128)
  initFrames() {
    this.frames = []
    for (let i = 0; i < 16; i++) {
      this.frames[i] = {
        array: new Float32Array(2048),
        offset: -i - 1,
      }
    }
  }

  initAggregate() {
    this.aggregate = {
      bins: new this.essentia.module.VectorVectorFloat(),
      saliences: new this.essentia.module.VectorVectorFloat(),
      length: 0,
    }
  }

  process(inputs, outputs, parameters) {
    const essentia = this.essentia
    // assume mono
    const input = inputs[0][0]
    this.frames.forEach(frame => {
      frame.offset += 1
      if (frame.offset >= 0) {
        frame.offset %= 16
        frame.array.set(input, frame.offset * 128)
        if (frame.offset == 15) {
          // start processing chain
          const signal = essentia.arrayToVector(frame.array)
          const eqloud = essentia.EqualLoudness(signal).signal
          const window = essentia.Windowing(
            eqloud, true, 2048, 'hann', 4, true
          ).frame
          const spectrum = essentia.Spectrum(window).spectrum
          const spectralPeaks = essentia.SpectralPeaks(spectrum)
          // check sizes before converting to arrays
          if (
            spectralPeaks.frequencies.size()
            && spectralPeaks.magnitudes.size()
          ) {
            // check peaks before salience function
            const freq = essentia.vectorToArray(spectralPeaks.frequencies)
            const mag = essentia.vectorToArray(spectralPeaks.magnitudes)
            if (!freq.some(f => f <= 0) && !mag.some(m => m < 0)) {
              // pitch salience
              const salienceFunction = essentia.PitchSalienceFunction(
                ...Object.values(spectralPeaks)
              ).salienceFunction
              const salienceFunctionPeaks =
                essentia.PitchSalienceFunctionPeaks(salienceFunction)
              this.aggregate.bins.push_back(
                salienceFunctionPeaks.salienceBins
              )
              this.aggregate.saliences.push_back(
                salienceFunctionPeaks.salienceValues
              )
              this.aggregate.length++
              // run the offline melody stage once 128 salience frames are buffered
              if (this.aggregate.length == 128) {
                const pitchContours = this.essentia.PitchContours(
                  this.aggregate.bins, this.aggregate.saliences
                )
                console.log('PitchContours', pitchContours)
                const pitchContoursMultiMelody = this.essentia
                  .PitchContoursMultiMelody(...Object.values(pitchContours))
                  .pitch
                console.log('MultiMelody', pitchContoursMultiMelody)
                // delete objects ?
                pitchContoursMultiMelody.delete()
                pitchContours.contoursStartTimes.delete()
                pitchContours.contoursSaliences.delete()
                pitchContours.contoursBins.delete()
                this.aggregate.bins.delete()
                this.aggregate.saliences.delete()
                this.initAggregate()
              }
              salienceFunctionPeaks.salienceValues.delete()
              salienceFunctionPeaks.salienceBins.delete()
              salienceFunction.delete()
            }
          }
          spectralPeaks.frequencies.delete()
          spectralPeaks.magnitudes.delete()
          spectrum.delete()
          window.delete()
          eqloud.delete()
          signal.delete()
        }
      }
    })
    return true
  }
}

registerProcessor('AnalyserProcessor', AnalyserProcessor)
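A side note on the cleanup mentioned above: because all the delete() calls sit at the end of the block, any Embind vector created before an algorithm that happens to throw would never be freed. A minimal sketch of a leak-safe variant, using the same essentia.js handles as the code above:

const signal = essentia.arrayToVector(frame.array)
try {
  const eqloud = essentia.EqualLoudness(signal).signal
  try {
    // ...rest of the chain, each Embind vector wrapped the same way...
  } finally {
    eqloud.delete() // freed even if a later algorithm throws
  }
} finally {
  signal.delete()
}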
The PitchContours* melody algorithms aren't designed for real-time use, because they require a segment of audio in order to gather pitch contours and apply statistics over them. You can, however, compute the chain up to PitchSalienceFunctionPeaks in real time.
For real-time pitch detection we have a number of PitchYin* algorithms, but those are only suited for monophonic signals.
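For reference, a per-frame monophonic version could look roughly like the sketch below, reusing the frame buffer from the worklet above. It assumes the essentia.js PitchYin binding with its default parameters (frame size 2048, 44.1 kHz); the confidence threshold is illustrative, not a recommendation.

// Sketch only: per-frame monophonic pitch tracking with PitchYin
const signal = essentia.arrayToVector(frame.array)
const { pitch, pitchConfidence } = essentia.PitchYin(signal)
signal.delete() // pitch and pitchConfidence are plain numbers, so only the vector handle needs freeing
if (pitchConfidence > 0.8) { // illustrative threshold
  this.port.postMessage({ pitch })
}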
Thanks for the reply!
What is the minimum duration of the required audio segment?
In the code above I'm trying to implement a hybrid approach, so it's not strictly real-time. The salience function peaks are calculated in real time using a frame size of 2048 and a hop size of 128, as suggested in the recommended processing chain. I then store the resulting bins and saliences in two VectorVectorFloat objects (in this.aggregate) and wait until I have 128 frames' worth before passing them to the PitchContours and PitchContoursMultiMelody algorithms (then I reset this.aggregate and repeat).
The PitchContours algorithm seems to work fine and generates output. The chain also seems to work well if I replace PitchContoursMultiMelody with PitchContoursMonoMelody. I've now tried increasing the size of the aggregate from 128 to 256. This runs for a bit longer but also eventually freezes. The same happens if I increase it to 512: it runs for some time but eventually freezes.
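For a rough sense of the segment lengths these aggregate sizes correspond to (assuming the 128-sample hop described above and a 44.1 kHz sample rate; inside the worklet the sampleRate global could be used instead):

// Duration of audio covered by one aggregate of salience frames
const hopSize = 128 // samples between consecutive salience frames
const segmentSeconds = frames => (frames * hopSize) / 44100
segmentSeconds(128) // ≈ 0.37 s
segmentSeconds(256) // ≈ 0.74 s
segmentSeconds(512) // ≈ 1.49 s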
We'll have to look at the freezes. @albincorreya
If the contours look fine, you can try working on shorter segments and see whether the freeze still happens.
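One hypothetical way to narrow down the freeze is to time the offline part of the chain inside process(): a 128-sample render quantum at 44.1 kHz leaves roughly 2.9 ms per call, so a melody-extraction call that takes much longer will stall the audio thread. A sketch of such a check, using the same calls as the code above:

// Sketch: measure how long the offline melody stage takes
const t0 = Date.now()
const pitchContours = this.essentia.PitchContours(
  this.aggregate.bins, this.aggregate.saliences
)
const multiMelody = this.essentia
  .PitchContoursMultiMelody(...Object.values(pitchContours))
  .pitch
console.log('melody extraction took', Date.now() - t0, 'ms')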