essentia
essentia copied to clipboard
Music Extractor creating layout differences in json output
I'm running MusicExtractor on a dataset and writing one JSON file per audio file. I want to run convert.py on the JSON files to combine them into a single CSV for training.
It fails because the JSON files differ in layout. Why does the layout differ when every file is passed through the same function, which calls MusicExtractor? How can I force it to preserve a single layout?
This is the function def:
def extractMusicFeatures(filename):
    """Extract MusicExtractor features from an audio file and save them as JSON.

    Runs MusicExtractor on ``filename`` requesting the 'mean' and 'stdev'
    statistics for the low-level, rhythm and tonal descriptors, passes the
    resulting pool through PoolAggregator, and writes the aggregate with
    YamlOutput to ``<filename without its extension>_features.json``.

    Args:
        filename: Path of the audio file to analyse.

    Returns:
        None. The features are written to disk.
    """
    # Imported locally so the function does not depend on a module-level import.
    import os

    # The extractor call yields two pools; only the first one is written out.
    pool, _frames_pool = MusicExtractor(lowlevelStats=['mean', 'stdev'],
                                        rhythmStats=['mean', 'stdev'],
                                        tonalStats=['mean', 'stdev'])(filename)

    # NOTE(review): the pool is aggregated again here with PoolAggregator's
    # default settings. This is presumably related to the statistics nested
    # inside statistics seen in some outputs -- confirm whether this pass is
    # actually needed before relying on the resulting layout.
    aggPool = PoolAggregator()(pool)

    # Strip the real extension rather than assuming it is exactly four
    # characters long (".wav"); ".flac", ".aiff" or extension-less names
    # would otherwise yield a mangled output path.
    root, _ext = os.path.splitext(filename)
    YamlOutput(filename=root + '_' + "features.json",
               format="json",
               doubleCheck=True)(aggPool)
Some look like this:
"rhythm": {
"beats_count": 0,
"beats_loudness": {
"dmean": 0,
"dmean2": 0,
"dvar": 0,
"dvar2": 0,
"max": 0,
"mean": 0,
"median": 0,
"min": 0,
"stdev": 0,
"var": 0
},
"bpm": 0,
"bpm_histogram_first_peak_bpm": 0,
"bpm_histogram_first_peak_weight": 0,
"bpm_histogram_second_peak_bpm": 0,
"bpm_histogram_second_peak_spread": 0,
"bpm_histogram_second_peak_weight": 0,
"danceability": 0.621241211891,
"onset_rate": 0.742524206638,
"beats_loudness_band_ratio": {
"dmean": [0, 0, 0, 0, 0, 0],
"dmean2": [0, 0, 0, 0, 0, 0],
"dvar": [0, 0, 0, 0, 0, 0],
"dvar2": [0, 0, 0, 0, 0, 0],
"max": [0, 0, 0, 0, 0, 0],
"mean": [0, 0, 0, 0, 0, 0],
"median": [0, 0, 0, 0, 0, 0],
"min": [0, 0, 0, 0, 0, 0],
"stdev": [0, 0, 0, 0, 0, 0],
"var": [0, 0, 0, 0, 0, 0]
},
"beats_position": [],
"bpm_histogram": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
},
Others look like this:
"rhythm": {
"beats_count": 4,
"beats_loudness": {
"dmean": 0.0185536239296,
"dmean2": 0.038089312613,
"dvar": 0.000109063053969,
"dvar2": 0.000166488738614,
"max": 0.0749037861824,
"mean": 0.0535248741508,
"median": 0.0494076050818,
"min": 0.0403805039823,
"stdev": 0.0132451010868,
"var": 0.000175432694959
},
"beats_loudness_band_ratio": {
"dmean": {
"max": 0.0365547351539,
"mean": 0.016866132617,
"median": 0.0171192958951,
"min": 4.02092227887e-05,
"stdev": 0.0128420386463
},
"dmean2": {
"max": 0.0598655045033,
"mean": 0.0246758162975,
"median": 0.0194989442825,
"min": 5.76653728785e-05,
"stdev": 0.0225874632597
},
"dvar": {
"max": 9.30139431148e-05,
"mean": 4.22792836616e-05,
"median": 4.94886844535e-05,
"min": 2.77952383332e-10,
"stdev": 3.33894677169e-05
},
"dvar2": {
"max": 0.00114206771832,
"mean": 0.000231039521168,
"median": 4.82477771584e-05,
"min": 8.30940871666e-10,
"stdev": 0.000410665263189
},
"max": {
"max": 0.601244449615,
"mean": 0.187209561467,
"median": 0.0754443407059,
"min": 0.000183830386959,
"stdev": 0.220703974366
},
"mean": {
"max": 0.569649934769,
"mean": 0.173336461186,
"median": 0.0585699938238,
"min": 0.000137929571792,
"stdev": 0.212452918291
},
"median": {
"max": 0.57282012701,
"mean": 0.175628885627,
"median": 0.0615110397339,
"min": 0.000140584685141,
"stdev": 0.213430136442
},
"min": {
"max": 0.531715095043,
"mean": 0.154878526926,
"median": 0.0358135439456,
"min": 8.67185517563e-05,
"stdev": 0.20249080658
},
"stdev": {
"max": 0.024795403704,
"mean": 0.0120693258941,
"median": 0.0138675682247,
"min": 3.71553796867e-05,
"stdev": 0.00831406842917
},
"var": {
"max": 0.000614812073763,
"mean": 0.000214792336919,
"median": 0.000192369450815,
"min": 1.38052214105e-09,
"stdev": 0.000205521646421
}
},
"bpm": 128.14881897,
"bpm_histogram_first_peak_bpm": 129,
"bpm_histogram_first_peak_weight": 0.333333313465,
"bpm_histogram_second_peak_bpm": 0,
"bpm_histogram_second_peak_spread": 0,
"bpm_histogram_second_peak_weight": 0,
"danceability": 0.823930025101,
"onset_rate": 0.441706717014,
"beats_position": [0.452789098024, 0.917188167572, 1.39319729805, 1.8575963974],
"bpm_histogram": [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.333333343267, 0, 0, 0.666666686535, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
},
I don't expect them to produce sensible results as the audio files are single hits only a second long. I'll remove what I don't need later in feature selection. But I do need all MusicExtractor outputs to follow the same layout at this stage.
Are you specifically referring to the different items in beats_loudness_band_ratio? Everything else looks the same to me.
This looks like a bug which happens when the values are 0: https://github.com/MTG/essentia/blob/ba79be6515f2fd0cde75ee3f6fa98706a66f4c36/src/algorithms/extractor/musicextractor.cpp#L360