descript-audio-codec How to compress stereo sound by model.encode

I found that I cannot encode stereo sound with model.encode.

Nov 23 '23 03:11 tanggang1997

I'm finding the same thing. Using the example code from the README, the model.encode() fails if input.wav has two channels:

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Cell In[4], line 9
      6 signal.to(model.device)
      8 x = model.preprocess(signal.audio_data, signal.sample_rate)
----> 9 z, codes, latents, _, _ = model.encode(x)
     11 # Decode audio signal
     12 y = model.decode(z)

File ~/github/descript-audio-codec/dac/model/dac.py:243, in DAC.encode(self, audio_data, n_quantizers)
    209 def encode(
    210     self,
    211     audio_data: torch.Tensor,
    212     n_quantizers: int = None,
    213 ):
    214     """Encode given audio data and return quantized latent codes
    215 
    216     Parameters
   (...)
    241             Number of samples in input audio
    242     """
--> 243     z = self.encoder(audio_data)
    244     z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
    245         z, n_quantizers
    246     )
    247     return z, codes, latents, commitment_loss, codebook_loss

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/github/descript-audio-codec/dac/model/dac.py:91, in Encoder.forward(self, x)
     90 def forward(self, x):
---> 91     return self.block(x)

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
   1522 # If we don't have any hooks, we want to skip the rest of the logic in
   1523 # this function, and just call forward.
   1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1525         or _global_backward_pre_hooks or _global_backward_hooks
   1526         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527     return forward_call(*args, **kwargs)
   1529 try:
   1530     result = None

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/container.py:215, in Sequential.forward(self, input)
    213 def forward(self, input):
    214     for module in self:
--> 215         input = module(input)
    216     return input

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
   1516     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1517 else:
-> 1518     return self._call_impl(*args, **kwargs)

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1568, in Module._call_impl(self, *args, **kwargs)
   1565     bw_hook = hooks.BackwardHook(self, full_backward_hooks, backward_pre_hooks)
   1566     args = bw_hook.setup_input_hook(args)
-> 1568 result = forward_call(*args, **kwargs)
   1569 if _global_forward_hooks or self._forward_hooks:
   1570     for hook_id, hook in (
   1571         *_global_forward_hooks.items(),
   1572         *self._forward_hooks.items(),
   1573     ):
   1574         # mark that always called hook is run

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/conv.py:310, in Conv1d.forward(self, input)
    309 def forward(self, input: Tensor) -> Tensor:
--> 310     return self._conv_forward(input, self.weight, self.bias)

File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/conv.py:306, in Conv1d._conv_forward(self, input, weight, bias)
    302 if self.padding_mode != 'zeros':
    303     return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
    304                     weight, bias, self.stride,
    305                     _single(0), self.dilation, self.groups)
--> 306 return F.conv1d(input, weight, bias, self.stride,
    307                 self.padding, self.dilation, self.groups)

RuntimeError: Given groups=1, weight of size [64, 1, 7], expected input[1, 2, 7911424] to have 1 channels, but got 2 channels instead

@eeishaan, @pseeth, any info on how to tell it we want stereo?

Dec 26 '23 01:12 drscotthawley

I believe DAC works on the individual channel level. So you need to feed it each channel individually.

If you have a batch size of 1, just swapping the batch and channel dimensions will solve it: each channel is then treated as a separate mono item in the batch. Like this:

model = dac.DAC.load("weights.pth")
model.to("cuda")

signal = AudioSignal(args.audio_file)
signal.to(model.device)

x = model.preprocess(signal.audio_data, signal.sample_rate)

# Flip batch and channel dimension.
x = x.transpose(0, 1)
z, codes, latents, _, _ = model.encode(x)
y = model.decode(z)
# Drop the singleton channel dimension so y is (channels, samples) for saving.
y = y.squeeze(1)
torchaudio.save('output_ft.wav', y.cpu(), signal.sample_rate)

Jan 08 '24 01:01 lextoumbourou