Amphion icon indicating copy to clipboard operation
Amphion copied to clipboard

[BUG]: write_mp3 Emilia does not save in proper 24k sample rate frequency

Open huseinzol05 opened this issue 8 months ago • 1 comments

Describe the bug

Emilia's write_mp3 saves audio at a low bitrate. @khursani8 found that the frequency content of the audio is truncated at 8 kHz, so we dug into it.

How To Reproduce

  1. Take any audio file with a 24 kHz sample rate, for example https://github.com/mesolitica/malaya-speech/blob/master/speech/podcast/sg-chunk.mp3

  2. Plot its waveform and mel spectrogram:

import librosa
import matplotlib.pyplot as plt
import numpy as np
        
def plot_waveform_and_mel_spectrogram(waveform, sample_rate):
    """
    Plot a waveform and its mel spectrogram in a two-row figure.

    Parameters:
    -----------
    waveform : np.ndarray
        Audio samples, passed to librosa as ``y``.
    sample_rate : int
        Sample rate of ``waveform`` in Hz.

    Returns:
    --------
    None
        The figure is displayed with ``plt.show()``.
    """
    # `librosa.display` is a submodule that older librosa releases do not load
    # on a bare `import librosa`; import it explicitly so the
    # `librosa.display.*` calls below work regardless of the installed version.
    import librosa.display

    # One hop length shared by the analysis and the plot so that the time
    # axis of the spectrogram matches the frames that were computed.
    hop_length = 512

    mel_spec = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=80,
        n_fft=1024,
        hop_length=hop_length,
    )
    # Convert power to decibels relative to the loudest bin.
    mel_spec_db = librosa.power_to_db(mel_spec, ref=np.max)

    plt.figure(figsize=(12, 6))

    # Top panel: time-domain waveform.
    plt.subplot(2, 1, 1)
    librosa.display.waveshow(waveform, sr=sample_rate)
    plt.title("Waveform")
    plt.xlabel("Time (s)")
    plt.ylabel("Amplitude")

    # Bottom panel: mel spectrogram in dB.
    plt.subplot(2, 1, 2)
    librosa.display.specshow(
        mel_spec_db,
        sr=sample_rate,
        hop_length=hop_length,
        x_axis="time",
        y_axis="mel",
    )
    plt.colorbar(format="%+2.0f dB")
    plt.title("Mel Spectrogram")
    plt.xlabel("Time (s)")
    plt.ylabel("Mel Frequency")

    plt.tight_layout()
    plt.show()

You will get,

Image

  3. But if we re-encode the audio with write_mp3 from https://github.com/open-mmlab/Amphion/blob/main/preprocessors/Emilia/utils/tool.py#L52:
from pydub import AudioSegment

def write_mp3(path, sr, x):
    """
    Convert a numpy array to an MP3 file.

    Parameters:
    -----------
    path : str
        Destination handed to ``AudioSegment.export``.
    sr : int
        Sample rate in Hz, used as the segment's ``frame_rate``.
    x : np.ndarray
        Audio samples, written as a single channel. Input that is not
        already int16 is peak-normalized and converted to int16.

    Returns:
    --------
    None
        Any exception is caught and printed instead of being raised.
    """
    try:
        # Ensure x is in the correct format and normalize if necessary
        if x.dtype != np.int16:
            # Normalize the array to fit in int16 range if it's not already int16
            # NOTE(review): if x is all zeros the peak is 0 and this divides by zero.
            x = np.int16(x / np.max(np.abs(x)) * 32767)

        # Create audio segment from numpy array
        audio = AudioSegment(
            x.tobytes(), frame_rate=sr, sample_width=x.dtype.itemsize, channels=1
        )
        # Export as MP3 file
        # NOTE(review): no bitrate argument is passed here, so the MP3 bitrate
        # is whatever the encoder defaults to.
        audio.export(path, format="mp3")
    except Exception as e:
        # Best-effort: report the failure on stdout and return normally.
        print(e)
        print("Error: Failed to write MP3 file.")
# Load the 24 kHz source clip first so that `y` is defined before it is
# re-encoded (the clip is the sg-chunk.mp3 sample from step 1).
y, sr = librosa.load('sg-chunk.mp3', sr=24000)
write_mp3('out.mp3', sr, y)
# Decode the MP3 that was just written and plot it for comparison.
y, sr = librosa.load('out.mp3', sr=24000)
plot_waveform_and_mel_spectrogram(y, sr)

You will get,

Image

If you enable debug log level,

DEBUG:pydub.converter:subprocess.call(['ffmpeg', '-y', '-f', 'wav', '-i', '/tmp/tmplhbftkl6', '-f', 'mp3', '/tmp/tmpm3ttndt6'])
DEBUG:pydub.converter:subprocess output: b'ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers'
DEBUG:pydub.converter:subprocess output: b'  built with gcc 9 (Ubuntu 9.4.0-1ubuntu1~20.04.1)'
DEBUG:pydub.converter:subprocess output: b'  configuration: --prefix=/usr --extra-version=0ubuntu0.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-avresample --disable-filter=resample --enable-avisynth --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librsvg --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enable-libwavpack --enable-libwebp --enable-libx265 --enable-libxml2 --enable-libxvid --enable-libzmq --enable-libzvbi --enable-lv2 --enable-omx --enable-openal --enable-opencl --enable-opengl --enable-sdl2 --enable-libdc1394 --enable-libdrm --enable-libiec61883 --enable-nvenc --enable-chromaprint --enable-frei0r --enable-libx264 --enable-shared'
DEBUG:pydub.converter:subprocess output: b'  libavutil      56. 31.100 / 56. 31.100'
DEBUG:pydub.converter:subprocess output: b'  libavcodec     58. 54.100 / 58. 54.100'
DEBUG:pydub.converter:subprocess output: b'  libavformat    58. 29.100 / 58. 29.100'
DEBUG:pydub.converter:subprocess output: b'  libavdevice    58.  8.100 / 58.  8.100'
DEBUG:pydub.converter:subprocess output: b'  libavfilter     7. 57.100 /  7. 57.100'
DEBUG:pydub.converter:subprocess output: b'  libavresample   4.  0.  0 /  4.  0.  0'
DEBUG:pydub.converter:subprocess output: b'  libswscale      5.  5.100 /  5.  5.100'
DEBUG:pydub.converter:subprocess output: b'  libswresample   3.  5.100 /  3.  5.100'
DEBUG:pydub.converter:subprocess output: b'  libpostproc    55.  5.100 / 55.  5.100'
DEBUG:pydub.converter:subprocess output: b'Guessed Channel Layout for Input Stream #0.0 : mono'
DEBUG:pydub.converter:subprocess output: b"Input #0, wav, from '/tmp/tmplhbftkl6':"
DEBUG:pydub.converter:subprocess output: b'  Duration: 00:00:12.05, bitrate: 384 kb/s'
DEBUG:pydub.converter:subprocess output: b'    Stream #0:0: Audio: pcm_s16le ([1][0][0][0] / 0x0001), 24000 Hz, mono, s16, 384 kb/s'
DEBUG:pydub.converter:subprocess output: b'Stream mapping:'
DEBUG:pydub.converter:subprocess output: b'  Stream #0:0 -> #0:0 (pcm_s16le (native) -> mp3 (libmp3lame))'
DEBUG:pydub.converter:subprocess output: b'Press [q] to stop, [?] for help'
DEBUG:pydub.converter:subprocess output: b"Output #0, mp3, to '/tmp/tmpm3ttndt6':"
DEBUG:pydub.converter:subprocess output: b'  Metadata:'
DEBUG:pydub.converter:subprocess output: b'    TSSE            : Lavf58.29.100'
DEBUG:pydub.converter:subprocess output: b'    Stream #0:0: Audio: mp3 (libmp3lame), 24000 Hz, mono, s16p'
DEBUG:pydub.converter:subprocess output: b'    Metadata:'
DEBUG:pydub.converter:subprocess output: b'      encoder         : Lavc58.54.100 libmp3lame'
DEBUG:pydub.converter:subprocess output: b'size=      48kB time=00:00:12.07 bitrate=  32.3kbits/s speed= 371x'
DEBUG:pydub.converter:subprocess output: b'video:0kB audio:47kB subtitle:0kB other streams:0kB global headers:0kB muxing overhead: 0.488861%'
DEBUG:matplotlib.colorbar:locator: <matplotlib.ticker.AutoLocator object at 0x7fed11242bc0>

You can see that libmp3lame encodes at only about 32 kbit/s:

DEBUG:pydub.converter:subprocess output: b'      encoder         : Lavc58.54.100 libmp3lame'
DEBUG:pydub.converter:subprocess output: b'size=      48kB time=00:00:12.07 bitrate=  32.3kbits/s speed= 371x'

With the spectrum truncated at 8 kHz, you lose the quality that a 24 kHz sample rate is supposed to provide; the content should extend up to 12 kHz (the Nyquist frequency).

If we set the bitrate to 55k or higher,

from pydub import AudioSegment

def write_mp3(path, sr, x, bitrate="55k"):
    """
    Convert a numpy array to a mono MP3 file.

    Parameters:
    -----------
    path : str
        Destination handed to ``AudioSegment.export``.
    sr : int
        Sample rate in Hz, used as the segment's ``frame_rate``.
    x : np.ndarray
        Audio samples, written as a single channel. Input that is not
        already int16 is peak-normalized and converted to int16.
    bitrate : str
        MP3 bitrate passed to ``AudioSegment.export`` (default: "55k").
        Passing it explicitly keeps the encoder from falling back to its
        own default bitrate.

    Returns:
    --------
    None
        Any exception is caught and printed instead of being raised, so a
        failed write does not abort the caller.
    """
    try:
        x = np.asarray(x)
        # Ensure x is in the correct format and normalize if necessary
        if x.dtype != np.int16:
            peak = np.max(np.abs(x))
            if peak > 0:
                # Scale so the loudest sample maps to the int16 maximum.
                x = (x / peak * 32767).astype(np.int16)
            else:
                # Silent input: dividing by a zero peak would yield NaN and
                # an undefined int16 cast, so emit explicit silence instead.
                x = np.zeros(x.shape, dtype=np.int16)

        # Create audio segment from numpy array
        audio = AudioSegment(
            x.tobytes(), frame_rate=sr, sample_width=x.dtype.itemsize, channels=1
        )
        # Export as MP3 file at the requested bitrate
        audio.export(path, format="mp3", bitrate=bitrate)
    except Exception as e:
        # Best-effort: report the failure on stdout and return normally.
        print(e)
        print("Error: Failed to write MP3 file.")

Now the frequency content reaches the intended 12 kHz:

Image

To be safe, we should use soundfile.write.

Environment Information

I tested using 2 different ffmpeg version 4.2.7-0ubuntu0.1 Copyright (c) 2000-2022 the FFmpeg developers and ffmpeg version 6.1.1-3ubuntu5 Copyright (c) 2000-2023 the FFmpeg developers, both use the same libmp3lame 32kbits.

huseinzol05 avatar May 06 '25 05:05 huseinzol05

Can confirm. This is a sample from an MP3 file that was already processed by Emilia,

using this code

import librosa
import matplotlib.pyplot as plt
import numpy as np

def check_frequency_range(file_path, expected_sample_rate=24000, *,
                          cutoff_hz=8000, ratio_threshold=0.01):
    """
    Analyze an audio file to check if it suffers from frequency truncation.

    The file is loaded at ``expected_sample_rate``, its STFT magnitude is
    plotted, and the mean magnitude above ``cutoff_hz`` (up to Nyquist) is
    compared with the mean magnitude at or below ``cutoff_hz``.

    Parameters:
    -----------
    file_path : str
        Path to the audio file to analyze
    expected_sample_rate : int
        The expected sample rate of the audio (default: 24000)
    cutoff_hz : float
        Frequency at which truncation is suspected (default: 8000)
    ratio_threshold : float
        The file is reported as affected when the above/below energy ratio
        is smaller than this value (default: 0.01)

    Returns:
    --------
    bool
        True if the file appears to be affected (truncated at ~``cutoff_hz``).
        A file with no energy at or below the cutoff gets ratio 0 and is
        therefore also reported as affected.
    """
    # `librosa.display` is a submodule that older librosa releases do not load
    # on a bare `import librosa`; import it explicitly so `specshow` resolves.
    import librosa.display

    # Load the audio file
    y, sr = librosa.load(file_path, sr=expected_sample_rate)
    nyquist_hz = sr / 2

    # Compute the spectrogram; keep the magnitude so it is computed only once
    D = librosa.stft(y)
    magnitude = np.abs(D)
    S_db = librosa.amplitude_to_db(magnitude, ref=np.max)

    # Plot the spectrogram
    plt.figure(figsize=(12, 6))
    librosa.display.specshow(S_db, sr=sr, x_axis='time', y_axis='log')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f'Spectrogram of {file_path}')

    # Draw a horizontal line at the suspected cutoff
    cutoff_khz = f'{cutoff_hz / 1000:g}'
    plt.axhline(y=cutoff_hz, color='r', linestyle='--', alpha=0.7)
    plt.text(0, cutoff_hz + 500,
             f'{cutoff_khz}kHz - Expected Cutoff for Low Bitrate', color='r')

    # Draw a horizontal line at the Nyquist frequency (half the sample rate)
    plt.axhline(y=nyquist_hz, color='g', linestyle='--', alpha=0.7)
    plt.text(0, nyquist_hz + 500,
             f'{nyquist_hz / 1000:g}kHz - Expected Coverage for '
             f'{sr / 1000:g}kHz Audio', color='g')

    plt.tight_layout()
    plt.show()

    # Calculate energy above the cutoff vs. at or below it
    freqs = librosa.fft_frequencies(sr=sr)
    mask_below = freqs <= cutoff_hz
    mask_above = (freqs > cutoff_hz) & (freqs < nyquist_hz)

    # Per-bin mean over time, summed across the bins of each band
    energy_below = np.sum(np.mean(magnitude[mask_below], axis=1))
    energy_above = np.sum(np.mean(magnitude[mask_above], axis=1))

    ratio = energy_above / energy_below if energy_below > 0 else 0
    affected = bool(ratio < ratio_threshold)

    print(f"Energy ratio (above {cutoff_khz}kHz / below {cutoff_khz}kHz): {ratio:.6f}")
    print(f"The file {'appears to be affected' if affected else 'seems to have proper frequency content'}")

    return affected  # True if likely affected
Image

using ffprobe

{'streams': [{'index': 0, 'codec_name': 'mp3', 'codec_long_name': 'MP3 (MPEG audio layer 3)', 'codec_type': 'audio', 'codec_tag_string': '[0][0][0][0]', 'codec_tag': '0x0000', 'sample_fmt': 'fltp', 'sample_rate': '24000', 'channels': 1, 'channel_layout': 'mono', 'bits_per_sample': 0, 'initial_padding': 0, 'r_frame_rate': '0/0', 'avg_frame_rate': '0/0', 'time_base': '1/14112000', 'start_pts': 649740, 'start_time': '0.046042', 'duration_ts': 226920960, 'duration': '16.080000', 'bit_rate': '32000', 'disposition': {'default': 0, 'dub': 0, 'original': 0, 'comment': 0, 'lyrics': 0, 'karaoke': 0, 'forced': 0, 'hearing_impaired': 0, 'visual_impaired': 0, 'clean_effects': 0, 'attached_pic': 0, 'timed_thumbnails': 0, 'non_diegetic': 0, 'captions': 0, 'descriptions': 0, 'metadata': 0, 'dependent': 0, 'still_image': 0}}], 'format': {'filename': 'enc_indo_cc0_9.mp3', 'nb_streams': 1, 'nb_programs': 0, 'format_name': 'mp3', 'format_long_name': 'MP2/3 (MPEG audio layer 2/3)', 'start_time': '0.046042', 'duration': '16.080000', 'size': '64557', 'bit_rate': '32117', 'probe_score': 51, 'tags': {'encoder': 'Lavf60.16.100'}}}```

acul3 avatar May 08 '25 18:05 acul3