DeepFilterNet
What does "_ll_" mean in the ONNX tarballs?
What's the difference between these two files?
- DeepFilterNet3_ll_onnx.tar.gz
- DeepFilterNet3_onnx.tar.gz
The ll model is larger (35 MB vs. 7.7 MB), and it seems to run slower as a baseline when I feed it mostly silence; once I start speaking, it runs about as fast as the non-ll model.
What does LL mean - low latency? Is it supposed to be faster or slower? What part of the paper talks about this?
When I inspect each tarball's config.ini, I notice the ll variant does have 0 lookahead (both df_lookahead and conv_lookahead), which does suggest "low latency".
On the other hand, the LL variant has larger weights/more layers in a few places, which could explain why it runs slower?
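For reference, the lookahead settings can be compared directly from the two config.ini files. A minimal sketch; the extraction paths below are assumptions, and it simply prints every key containing "lookahead" rather than assuming section names:

```python
import configparser

# Assumed paths: wherever the two tarballs were extracted.
for path in ("DeepFilterNet3_onnx/config.ini", "DeepFilterNet3_ll_onnx/config.ini"):
    cfg = configparser.ConfigParser()
    cfg.read(path)
    print(path)
    for section in cfg.sections():
        for key, value in cfg[section].items():
            if "lookahead" in key:
                print(f"  [{section}] {key} = {value}")
```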
How did you run these two models? I tried to write a Python example to run them, but it fails with "ValueError: Required inputs (['e3', 'e2', 'e1', 'e0']) are missing from input feed (['emb']).".
```python
import onnxruntime as ort
import numpy as np
import soundfile as sf
from scipy import signal


def load_model(model_path):
    """Load the ONNX model."""
    session = ort.InferenceSession(model_path)
    return session


def load_audio(audio_path, target_sample_rate=48000):
    """Load and preprocess the audio file."""
    audio, sample_rate = sf.read(audio_path)
    # Ensure the audio is mono
    if len(audio.shape) > 1:
        audio = np.mean(audio, axis=1)  # Convert stereo to mono
    # Resample to the target sample rate (e.g., 48 kHz)
    if sample_rate != target_sample_rate:
        audio = signal.resample(audio, int(len(audio) * target_sample_rate / sample_rate))
        sample_rate = target_sample_rate
    # Normalize the audio to the range [-1, 1]
    audio = audio.astype(np.float32)
    audio /= np.max(np.abs(audio))
    return audio, sample_rate


def split_audio(audio, chunk_size=16000):
    """
    Split the audio into fixed-length chunks. If the audio length is not
    a multiple of chunk_size, pad the last chunk with zeros.
    """
    num_chunks = int(np.ceil(len(audio) / chunk_size))
    chunks = []
    for i in range(num_chunks):
        start = i * chunk_size
        end = start + chunk_size
        chunk = audio[start:end]
        # Pad the last chunk with zeros if necessary
        if len(chunk) < chunk_size:
            chunk = np.pad(chunk, (0, chunk_size - len(chunk)), mode="constant")
        chunks.append(chunk)
    return chunks


def process_audio(model, audio, chunk_size=16000):
    """Process the audio using the ONNX model."""
    # Split the audio into chunks
    chunks = split_audio(audio, chunk_size)
    # Process each chunk
    processed_chunks = []
    for chunk in chunks:
        # Prepare the input tensor
        input_data = np.expand_dims(chunk, axis=0)  # Add batch dimension
        input_data = np.expand_dims(input_data, axis=0)  # Add channel dimension
        # Get input and output names
        input_name = model.get_inputs()[0].name
        output_name = model.get_outputs()[0].name
        # Run the model
        output = model.run([output_name], {input_name: input_data})[0]
        # Postprocess the output
        output_audio = np.squeeze(output)  # Remove batch and channel dimensions
        processed_chunks.append(output_audio)
    # Combine the processed chunks
    processed_audio = np.concatenate(processed_chunks)
    return processed_audio


def save_audio(audio, output_path, sample_rate):
    """Save the processed audio to a WAV file."""
    sf.write(output_path, audio, sample_rate)


def main(model_path, audio_path, output_path, chunk_size=16000):
    """Main function to process a WAV file using the ONNX model."""
    # Load the model
    model = load_model(model_path)
    # Load and preprocess the audio
    audio, sample_rate = load_audio(audio_path)
    # Process the audio
    processed_audio = process_audio(model, audio, chunk_size)
    # Save the processed audio
    save_audio(processed_audio, output_path, sample_rate)
    print(f"Processed audio saved to {output_path}")


# Example usage
model_path = "./model/deepFilter/DeepFilterNet2_onnx/export/erb_dec.onnx"  # Path to the ONNX model
audio_path = "./test_audio/test1.wav"  # Path to the input WAV file
output_path = "./output.wav"  # Path to save the processed audio
main(model_path, audio_path, output_path)
```
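One way to see where that ValueError comes from is to list the inputs the graph actually declares. A short diagnostic sketch using the ONNX Runtime API (the model path is the one from the script above):

```python
import onnxruntime as ort

# erb_dec.onnx is only the ERB decoder stage of the pipeline. Listing its
# declared inputs shows it expects the encoder's outputs (emb plus e0..e3),
# not raw audio, which is why a single-tensor input feed raises the error.
sess = ort.InferenceSession("./model/deepFilter/DeepFilterNet2_onnx/export/erb_dec.onnx")
for inp in sess.get_inputs():
    print("input: ", inp.name, inp.shape, inp.type)
for out in sess.get_outputs():
    print("output:", out.name, out.shape, out.type)
```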
I used the Rust code/libDF, not the Python code, so I can't help you, sorry.
Thanks for your reply. May I ask whether those ONNX models are pre-trained, i.e., can they be used directly for noise suppression tasks?
The architecture is a bit more complicated. As you've seen, there are 3 sub-models in the ONNX tarball:
tmp/export/enc.onnx
tmp/export/erb_dec.onnx
tmp/export/df_dec.onnx
All 3 work together to implement the denoising task, with code in between that does the spectral transform (FFT/STFT), converts the spectrum into ERB bands, and handles a lot of pre-processing and intermediate processing.
It's going to be hard to use them directly without implementing all of those steps.
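If the goal is just to run noise suppression from Python, the project's own Python package wraps all of those steps around the same models. A minimal sketch following the usage shown in the DeepFilterNet README, assuming the package is installed (pip install deepfilternet); the file names are placeholders:

```python
# The package handles the STFT/ERB pre- and post-processing internally,
# so no manual chaining of enc/erb_dec/df_dec is needed.
from df.enhance import enhance, init_df, load_audio, save_audio

model, df_state, _ = init_df()  # loads the default pre-trained model
audio, _ = load_audio("noisy.wav", sr=df_state.sr())
enhanced = enhance(model, df_state, audio)
save_audio("enhanced.wav", enhanced, df_state.sr())
```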
Have you found any solution for running inference with the ONNX files?