[BUG]: Emilia write_mp3 does not preserve the full frequency range of 24 kHz sample rate audio
Describe the bug
Emilia's `write_mp3` saves at a low bitrate. @khursani8 found that the audio's frequency content was truncated at 8 kHz, so we dug into it.
How To Reproduce
-
You can take any audio with a 24 kHz sample rate, for example https://github.com/mesolitica/malaya-speech/blob/master/speech/podcast/sg-chunk.mp3
-
Plot the graph,
import librosa
import matplotlib.pyplot as plt
import numpy as np
def plot_waveform_and_mel_spectrogram(waveform, sample_rate):
    """Display a waveform and its mel spectrogram stacked in one figure.

    The upper panel shows the time-domain signal; the lower panel shows an
    80-band mel spectrogram (FFT size 1024, hop length 512) converted to dB
    relative to its own maximum. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    hop = 512  # shared by the spectrogram computation and its display

    mel_power = librosa.feature.melspectrogram(
        y=waveform, sr=sample_rate, n_mels=80, n_fft=1024, hop_length=hop
    )
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    plt.figure(figsize=(12, 6))

    # Upper panel: raw waveform over time.
    plt.subplot(2, 1, 1)
    librosa.display.waveshow(waveform, sr=sample_rate)
    plt.title("Waveform")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")

    # Lower panel: mel spectrogram in dB.
    plt.subplot(2, 1, 2)
    librosa.display.specshow(
        mel_db, sr=sample_rate, hop_length=hop, x_axis="time", y_axis="mel"
    )
    plt.colorbar(format="%+2.0f dB")
    plt.title("Mel Spectrogram")
    plt.xlabel("Time (s)")
    plt.ylabel("Mel Frequency")

    plt.tight_layout()
    plt.show()
You will get,
- But if we use
`write_mp3` from https://github.com/open-mmlab/Amphion/blob/main/preprocessors/Emilia/utils/tool.py#L52,
from pydub import AudioSegment
def write_mp3(path, sr, x):
    """Convert numpy array to MP3.

    Builds a mono pydub ``AudioSegment`` from the samples in ``x`` at frame
    rate ``sr`` and exports it to ``path`` as MP3. Any exception is caught
    and printed; nothing is raised or returned.
    """
    try:
        # Ensure x is in the correct format and normalize if necessary
        if x.dtype != np.int16:
            # Normalize the array to fit in int16 range if it's not already int16
            # (peak normalization: the largest absolute sample maps to 32767).
            x = np.int16(x / np.max(np.abs(x)) * 32767)
        # Create audio segment from numpy array
        audio = AudioSegment(
            x.tobytes(), frame_rate=sr, sample_width=x.dtype.itemsize, channels=1
        )
        # Export as MP3 file
        # NOTE: no bitrate is passed here; in the debug log of this report the
        # resulting libmp3lame stream comes out at about 32 kbit/s.
        audio.export(path, format="mp3")
    except Exception as e:
        # Best-effort: the failure is reported on stdout, not propagated.
        print(e)
        print("Error: Failed to write MP3 file.")
# Round-trip: encode the waveform to MP3, load it back, and plot the result.
# NOTE(review): `y` is presumably the 24 kHz waveform loaded from the sample
# file in the earlier step; that load is not shown in this snippet.
write_mp3('out.mp3', 24000, y)
y, sr = librosa.load('out.mp3', sr = 24000)
plot_waveform_and_mel_spectrogram(y, sr)
You will get,
If you enable debug log level,
DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-f', 'wav', '-i', '/tmp/tmplhbftkl6', '-f', 'mp3', '/tmp/tmpm3ttndt6'])
DEBUG:pydub.converter:subprocess output: b'ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers'
DEBUG:pydub.converter:subprocess output: b' built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)'
DEBUG:pydub.converter:subprocess output: b' configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared'
DEBUG:pydub.converter:subprocess output: b' libavutil 56. 31.100 / 56. 31.100'
DEBUG:pydub.converter:subprocess output: b' libavcodec 58. 54.100 / 58. 54.100'
DEBUG:pydub.converter:subprocess output: b' libavformat 58. 29.100 / 58. 29.100'
DEBUG:pydub.converter:subprocess output: b' libavdevice 58. 8.100 / 58. 8.100'
DEBUG:pydub.converter:subprocess output: b' libavfilter 7. 57.100 / 7. 57.100'
DEBUG:pydub.converter:subprocess output: b' libavresample 4. 0. 0 / 4. 0. 0'
DEBUG:pydub.converter:subprocess output: b' libswscale 5. 5.100 / 5. 5.100'
DEBUG:pydub.converter:subprocess output: b' libswresample 3. 5.100 / 3. 5.100'
DEBUG:pydub.converter:subprocess output: b' libpostproc 55. 5.100 / 55. 5.100'
DEBUG:pydub.converter:subprocess output: b'Guessed Channel Layout for Input Stream #0.0 : mono'
DEBUG:pydub.converter:subprocess output: b"Input #0, wav, from '/tmp/tmplhbftkl6':"
DEBUG:pydub.converter:subprocess output: b' Duration: 00:00:12.05, bitrate: 384 kb/s'
DEBUG:pydub.converter:subprocess output: b' Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, mono, s16, 384 kb/s'
DEBUG:pydub.converter:subprocess output: b'Stream mapping:'
DEBUG:pydub.converter:subprocess output: b' Stream #0:0 -> #0:0 (pcm_s16le (native) -> mp3 (libmp3lame))'
DEBUG:pydub.converter:subprocess output: b'Press [q] to stop, [?] for help'
DEBUG:pydub.converter:subprocess output: b"Output #0, mp3, to '/tmp/tmpm3ttndt6':"
DEBUG:pydub.converter:subprocess output: b' Metadata:'
DEBUG:pydub.converter:subprocess output: b' TSSE : Lavf58.29.100'
DEBUG:pydub.converter:subprocess output: b' Stream #0:0: Audio: mp3 (libmp3lame), 24000 Hz, mono, s16p'
DEBUG:pydub.converter:subprocess output: b' Metadata:'
DEBUG:pydub.converter:subprocess output: b' encoder : Lavc58.54.100 libmp3lame'
DEBUG:pydub.converter:subprocess output: b'size= 48kB time=00:00:12.07 bitrate= 32.3kbits/s speed= 371x'
DEBUG:pydub.converter:subprocess output: b'video:0kB audio:47kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.488861%'
DEBUG:matplotlib.colorbar:locator: <matplotlib.ticker.AutoLocator object at 0x7fed11242bc0>
You can see that libmp3lame uses a bitrate of about 32 kbit/s,
DEBUG:pydub.converter:subprocess output: b' encoder : Lavc58.54.100 libmp3lame'
DEBUG:pydub.converter:subprocess output: b'size= 48kB time=00:00:12.07 bitrate= 32.3kbits/s speed= 371x'
With the spectrum truncated at 8 kHz, you lose the intended quality of a 24 kHz sample rate, which is supposed to cover frequencies up to 12 kHz (the Nyquist frequency).
If we set the bitrate to 55k or higher,
from pydub import AudioSegment
def write_mp3(path, sr, x, bitrate="55k"):
    """Convert a numpy array to a mono MP3 file at an explicit bitrate.

    Parameters
    ----------
    path : str
        Destination file path.
    sr : int
        Sample (frame) rate of ``x`` in Hz.
    x : numpy.ndarray
        Audio samples. Arrays that are not already ``int16`` are
        peak-normalized and converted to ``int16``.
    bitrate : str
        Bitrate passed to ``AudioSegment.export`` (for example ``"55k"``).
        When no bitrate is given, the encoder default applies, which for
        24 kHz mono was observed to be about 32 kbit/s and to cut content
        above 8 kHz.

    Notes
    -----
    Failures are reported on stdout and not raised (best-effort write).
    """
    try:
        # Ensure x is in the correct format and normalize if necessary
        if x.dtype != np.int16:
            peak = np.max(np.abs(x))
            if peak > 0:
                # Scale so the largest absolute sample maps to 32767.
                x = np.int16(x / peak * 32767)
            else:
                # All-zero (silent) input: dividing by the zero peak would
                # produce NaN, which has no defined int16 value.
                x = np.zeros(x.shape, dtype=np.int16)
        # Create audio segment from numpy array
        audio = AudioSegment(
            x.tobytes(), frame_rate=sr, sample_width=x.dtype.itemsize, channels=1
        )
        # Export as MP3 file with an explicit bitrate so the encoder does not
        # fall back to its low default.
        audio.export(path, format="mp3", bitrate=bitrate)
    except Exception as e:
        print(e)
        print("Error: Failed to write MP3 file.")
We can now see that the frequency content reaches the intended 12 kHz,
To be safe, we should use `soundfile.write`.
Environment Information
I tested with two different ffmpeg versions, `ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers` and `ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers`; both use the same libmp3lame bitrate of 32 kbit/s.
Can confirm. This is a sample from an MP3 file that was already processed by Emilia,
analyzed using this code
import librosa
import matplotlib.pyplot as plt
import numpy as np
def check_frequency_range(file_path, expected_sample_rate=24000):
    """
    Check whether an audio file looks band-limited at roughly 8 kHz.

    The file is loaded at ``expected_sample_rate``, its spectrogram is
    plotted with reference lines at 8 kHz and 12 kHz, and the mean STFT
    magnitude in the 8-12 kHz band is compared with that at or below 8 kHz.

    Parameters:
    -----------
    file_path : str
        Path to the audio file to analyze
    expected_sample_rate : int
        Sample rate used when loading the audio (default: 24000)

    Returns:
    --------
    bool
        True if the energy ratio is below 0.01, i.e. the file appears to
        be affected (truncated at ~8kHz)
    """
    # Decode the file and take the STFT magnitude once; it is reused below.
    samples, sr = librosa.load(file_path, sr=expected_sample_rate)
    stft = librosa.stft(samples)
    magnitude = np.abs(stft)
    magnitude_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    # Spectrogram on a log-frequency axis.
    plt.figure(figsize=(12, 6))
    librosa.display.specshow(magnitude_db, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Spectrogram of {file_path}')

    # Red marker: where a low-bitrate encode is expected to cut off.
    plt.axhline(y=8000, color='r', linestyle='--', alpha=0.7)
    plt.text(0, 8500, '8kHz - Expected Cutoff for Low Bitrate', color='r')

    # Green marker: Nyquist frequency of 24 kHz audio.
    plt.axhline(y=12000, color='g', linestyle='--', alpha=0.7)
    plt.text(0, 12500, '12kHz - Expected Coverage for 24kHz Audio', color='g')

    plt.tight_layout()
    plt.show()

    # Per-bin time-averaged magnitude, summed over each frequency band.
    bin_freqs = librosa.fft_frequencies(sr=sr)
    low_band = bin_freqs <= 8000
    high_band = (bin_freqs > 8000) & (bin_freqs < 12000)
    energy_below = np.sum(np.mean(magnitude[low_band], axis=1))
    energy_above = np.sum(np.mean(magnitude[high_band], axis=1))

    if energy_below > 0:
        ratio = energy_above / energy_below
    else:
        ratio = 0

    affected = ratio < 0.01
    verdict = (
        'appears to be affected'
        if affected
        else 'seems to have proper frequency content'
    )
    print(f"Energy ratio (above 8kHz / below 8kHz): {ratio:.6f}")
    print(f"The file {verdict}")

    return affected  # True if likely affected
using ffprobe
{'streams': [{'index': 0, 'codec_name': 'mp3', 'codec_long_name': 'MP3 (MPEG audio layer 3)', 'codec_type': 'audio', 'codec_tag_string': '[0][0][0][0]', 'codec_tag': '0x0000', 'sample_fmt': 'fltp', 'sample_rate': '24000', 'channels': 1, 'channel_layout': 'mono', 'bits_per_sample': 0, 'initial_padding': 0, 'r_frame_rate': '0/0', 'avg_frame_rate': '0/0', 'time_base': '1/14112000', 'start_pts': 649740, 'start_time': '0.046042', 'duration_ts': 226920960, 'duration': '16.080000', 'bit_rate': '32000', 'disposition': {'default': 0, 'dub': 0, 'original': 0, 'comment': 0, 'lyrics': 0, 'karaoke': 0, 'forced': 0, 'hearing_impaired': 0, 'visual_impaired': 0, 'clean_effects': 0, 'attached_pic': 0, 'timed_thumbnails': 0, 'non_diegetic': 0, 'captions': 0, 'descriptions': 0, 'metadata': 0, 'dependent': 0, 'still_image': 0}}], 'format': {'filename': 'enc_indo_cc0_9.mp3', 'nb_streams': 1, 'nb_programs': 0, 'format_name': 'mp3', 'format_long_name': 'MP2/3 (MPEG audio layer 2/3)', 'start_time': '0.046042', 'duration': '16.080000', 'size': '64557', 'bit_rate': '32117', 'probe_score': 51, 'tags': {'encoder': 'Lavf60.16.100'}}}```