LibrosaCpp
LibrosaCpp copied to clipboard
Mel spectrogram outputs from Python (librosa) and C++ (LibrosaCpp) don't match
Python Code
# Constant-valued test signal: 5 seconds of 0.2 at a 16 kHz sample rate.
sample_rate = 16_000
samples = np.full(5 * sample_rate, 0.2)

# No pad_mode is given, so librosa's default padding mode applies.
mel_spec = librosa.feature.melspectrogram(
    y=samples,
    sr=sample_rate,
    n_fft=1024,
    hop_length=512,
    n_mels=128,
)

# Summary statistics used to compare against the C++ implementation.
print('shape: ', mel_spec.shape)
for label, reduce_fn in (
    ('sum: ', mel_spec.sum),
    ('min: ', mel_spec.min),
    ('max: ', mel_spec.max),
):
    print(label, reduce_fn())
Python Output
shape: (128, 157)
sum: 11806.58392107923
min: 7.865755998623591e-34
max: 89.1440147017634
CPP Code
// Reproduction case: mel spectrogram of a constant signal (5 * sr samples, all
// 0.2), summarised by shape, sum, min and max for comparison with Python.
int sr = 16000;
int n_fft = 1024;
int n_hop = 512;
int n_mel = 128;
// These are ints, so initialise them from integer expressions rather than
// from doubles (0.0, sr / 2.0) that are implicitly truncated.
int fmin = 0;
int fmax = sr / 2;
std::vector<float> x(sr * 5, 0.2f);
// NOTE(review): "reflect" is presumably the padding mode; the Python call has
// to use the same mode for the two outputs to be comparable.
std::vector<std::vector<float>> mels = librosa::Feature::melspectrogram(
    x, sr, n_fft, n_hop, "hann", true, "reflect", 2.f, n_mel, fmin, fmax);
std::cout << "shape: [" << mels.size() << "," << mels[0].size() << "]" << std::endl;

double sums = 0;
// Running extrema start from +/-INT_MAX, converted to float explicitly instead
// of through a silent int -> float conversion.
float maxi = -static_cast<float>(INT_MAX);
float mini = static_cast<float>(INT_MAX);
for (const auto &row : mels) {
    for (const float value : row) {
        sums += value;  // accumulate in double to limit rounding error
        maxi = std::max(maxi, value);
        mini = std::min(mini, value);
    }
}
std::cout << "sums: " << sums << "\n";
std::cout << "mini: " << mini << "\n";
std::cout << "maxi: " << maxi << "\n";
CPP output
shape: [157,128]
sums: 11761.6
mini: 5.23758e-16
maxi: 74.9149
I have the same issue, the outputs don't match.
Can you recommend some code-oriented resources for learning the basics of audio signal processing, such as the STFT and mel spectrograms?
Python Code
# Quoted reproduction snippet, one statement per line.
# Constant-valued test signal: 16_000 * 5 samples, all equal to 0.2.
samples = np.full(16_000 * 5, 0.2)
# No pad_mode is given, so librosa's default padding mode applies.
mel_spec = librosa.feature.melspectrogram(y=samples, sr=16_000, n_fft=1024, hop_length=512, n_mels=128)
print('shape: ', mel_spec.shape)
print('sum: ', mel_spec.sum())
print('min: ', mel_spec.min())
print('max: ', mel_spec.max())
Python Output
shape: (128, 157)
sum: 11806.58392107923
min: 7.865755998623591e-34
max: 89.1440147017634
CPP Code
// Quoted reproduction case: mel spectrogram of a constant signal (5 * sr
// samples, all 0.2), summarised by shape, sum, min and max.
int sr = 16000;
int n_fft = 1024;
int n_hop = 512;
int n_mel = 128;
// These are ints, so initialise them from integer expressions rather than
// from doubles (0.0, sr / 2.0) that are implicitly truncated.
int fmin = 0;
int fmax = sr / 2;
std::vector<float> x(sr * 5, 0.2f);
// NOTE(review): "reflect" is presumably the padding mode; the Python call has
// to use the same mode for the two outputs to be comparable.
std::vector<std::vector<float>> mels = librosa::Feature::melspectrogram(
    x, sr, n_fft, n_hop, "hann", true, "reflect", 2.f, n_mel, fmin, fmax);
std::cout << "shape: [" << mels.size() << "," << mels[0].size() << "]" << std::endl;

double sums = 0;
// Running extrema start from +/-INT_MAX, converted to float explicitly instead
// of through a silent int -> float conversion.
float maxi = -static_cast<float>(INT_MAX);
float mini = static_cast<float>(INT_MAX);
for (const auto &row : mels) {
    for (const float value : row) {
        sums += value;  // accumulate in double to limit rounding error
        maxi = std::max(maxi, value);
        mini = std::min(mini, value);
    }
}
std::cout << "sums: " << sums << "\n";
std::cout << "mini: " << mini << "\n";
std::cout << "maxi: " << maxi << "\n";
CPP output
shape: [157,128]
sums: 11761.6
mini: 5.23758e-16
maxi: 74.9149
I ran into the same problem. Did you solve it?
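The difference comes from the padding mode: the C++ call passes "reflect", while the Python call relies on librosa's default (pad_mode='constant' in librosa 0.10 and later). Setting pad_mode='reflect' explicitly in the Python code makes the sum and max agree with the C++ output: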
# Same constant test signal (16000 * 5 samples of 0.2), this time requesting
# reflect padding explicitly instead of relying on the library default.
sample_rate = 16000
samples = np.full(5 * sample_rate, 0.2)
mel_spec = librosa.feature.melspectrogram(
    y=samples,
    sr=sample_rate,
    n_fft=1024,
    hop_length=512,
    n_mels=128,
    pad_mode='reflect',
)

# Print the transposed spectrogram first, then the summary statistics.
print(mel_spec.T)
print('shape: ', mel_spec.shape)
for label, reduce_fn in (
    ('sum: ', mel_spec.sum),
    ('min: ', mel_spec.min),
    ('max: ', mel_spec.max),
):
    print(label, reduce_fn())
python output:
shape: (128, 157)
sum: 11761.64503417969
min: 1.1692227111050498e-33
max: 74.91493652343752