optimum Add support for Musicgen Melody in the ONNX export

Feature request

Support ONNX export of Musicgen Melody, including audio prompting.

Motivation

Currently, Optimum does not support ONNX export for Musicgen Melody models. Transformers already supports this model (musicgen-melody) through its dedicated configuration in configuration_musicgen_melody, but Optimum is missing the export configuration needed to fully integrate it. ONNX export of musicgen-melody with "audio prompting" would enable audio- and text-conditioned music generation, which is essential for advanced music editing and generation applications.

optimum-cli export onnx --model facebook/musicgen-melody musicgen_melody_onnx/

https://github.com/huggingface/transformers/blob/main/src/transformers/models/musicgen_melody/configuration_musicgen_melody.py

Your contribution

I’ll attempt to adapt the current configuration for Musicgen to work with musicgen-melody, adding the audio_decoder dummy generator and configuring the necessary inputs. However, I have limited knowledge in this area, especially around setting up the dummy generator and managing the inputs for the audio encoder. Any guidance or examples would be greatly appreciated.

Nov 13 '24 00:11 rubeniskov

+1, it would be great to export Musicgen Melody models into ONNX

Nov 13 '24 05:11 gabotechs

As far as I can see, the configuration is very similar, except for two new parameters, "chroma_length" and "num_chroma", and two removed ones, "is_encoder_decoder" and "classifier_dropout".

diff --git a/MusicgenConfig.txt b/MusicgenMelodyConfig.txt
index 6392f0c..24ac168 100644
--- a/MusicgenConfig.txt
+++ b/MusicgenMelodyConfig.txt
@@ -1,8 +1,8 @@
 {
   "_attn_implementation_autoset": true,
-  "_name_or_path": "facebook/musicgen-small",
+  "_name_or_path": "facebook/musicgen-melody",
   "architectures": [
-    "MusicgenForConditionalGeneration"
+    "MusicgenMelodyForConditionalGeneration"
   ],
   "audio_encoder": {
     "_attn_implementation_autoset": false,
@@ -100,6 +100,7 @@
     "use_causal_conv": false,
     "use_conv_shortcut": false
   },
+  "chroma_length": 235,
   "decoder": {
     "_attn_implementation_autoset": false,
     "_name_or_path": "",
@@ -113,7 +114,6 @@
     "begin_suppress_tokens": null,
     "bos_token_id": 2048,
     "chunk_size_feed_forward": 0,
-    "classifier_dropout": 0,
     "cross_attention_hidden_size": null,
     "decoder_start_token_id": null,
     "diversity_penalty": 0,
@@ -123,11 +123,11 @@
     "encoder_no_repeat_ngram_size": 0,
     "eos_token_id": null,
     "exponential_decay_length_penalty": null,
-    "ffn_dim": 4096,
+    "ffn_dim": 6144,
     "finetuning_task": null,
     "forced_bos_token_id": null,
     "forced_eos_token_id": null,
-    "hidden_size": 1024,
+    "hidden_size": 1536,
     "id2label": {
       "0": "LABEL_0",
       "1": "LABEL_1"
@@ -144,13 +144,13 @@
     "max_length": 20,
     "max_position_embeddings": 2048,
     "min_length": 0,
-    "model_type": "musicgen_decoder",
+    "model_type": "musicgen_melody_decoder",
     "no_repeat_ngram_size": 0,
-    "num_attention_heads": 16,
+    "num_attention_heads": 24,
     "num_beam_groups": 1,
     "num_beams": 1,
     "num_codebooks": 4,
-    "num_hidden_layers": 24,
+    "num_hidden_layers": 48,
     "num_return_sequences": 1,
     "output_attentions": false,
     "output_hidden_states": false,
@@ -181,8 +181,8 @@
     "use_cache": true,
     "vocab_size": 2048
   },
-  "is_encoder_decoder": true,
-  "model_type": "musicgen",
+  "model_type": "musicgen_melody",
+  "num_chroma": 12,
   "text_encoder": {
     "_attn_implementation_autoset": false,
     "_name_or_path": "t5-base",

Nov 13 '24 12:11 rubeniskov