descript-audio-codec
descript-audio-codec copied to clipboard
How to compress stereo sound with model.encode
I found that I cannot encode stereo sound with model.encode.
I'm finding the same thing. Using the example code from the README, the model.encode() fails if input.wav has two channels:
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Cell In[4], line 9
6 signal.to(model.device)
8 x = model.preprocess(signal.audio_data, signal.sample_rate)
----> 9 z, codes, latents, _, _ = model.encode(x)
11 # Decode audio signal
12 y = model.decode(z)
File ~/github/descript-audio-codec/dac/model/dac.py:243, in DAC.encode(self, audio_data, n_quantizers)
209 def encode(
210 self,
211 audio_data: torch.Tensor,
212 n_quantizers: int = None,
213 ):
214 """Encode given audio data and return quantized latent codes
215
216 Parameters
(...)
241 Number of samples in input audio
242 """
--> 243 z = self.encoder(audio_data)
244 z, codes, latents, commitment_loss, codebook_loss = self.quantizer(
245 z, n_quantizers
246 )
247 return z, codes, latents, commitment_loss, codebook_loss
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/github/descript-audio-codec/dac/model/dac.py:91, in Encoder.forward(self, x)
90 def forward(self, x):
---> 91 return self.block(x)
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1527, in Module._call_impl(self, *args, **kwargs)
1522 # If we don't have any hooks, we want to skip the rest of the logic in
1523 # this function, and just call forward.
1524 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1525 or _global_backward_pre_hooks or _global_backward_hooks
1526 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1527 return forward_call(*args, **kwargs)
1529 try:
1530 result = None
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/container.py:215, in Sequential.forward(self, input)
213 def forward(self, input):
214 for module in self:
--> 215 input = module(input)
216 return input
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1518, in Module._wrapped_call_impl(self, *args, **kwargs)
1516 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1517 else:
-> 1518 return self._call_impl(*args, **kwargs)
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/module.py:1568, in Module._call_impl(self, *args, **kwargs)
1565 bw_hook = hooks.BackwardHook(self, full_backward_hooks, backward_pre_hooks)
1566 args = bw_hook.setup_input_hook(args)
-> 1568 result = forward_call(*args, **kwargs)
1569 if _global_forward_hooks or self._forward_hooks:
1570 for hook_id, hook in (
1571 *_global_forward_hooks.items(),
1572 *self._forward_hooks.items(),
1573 ):
1574 # mark that always called hook is run
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/conv.py:310, in Conv1d.forward(self, input)
309 def forward(self, input: Tensor) -> Tensor:
--> 310 return self._conv_forward(input, self.weight, self.bias)
File ~/envs/hs/lib/python3.11/site-packages/torch/nn/modules/conv.py:306, in Conv1d._conv_forward(self, input, weight, bias)
302 if self.padding_mode != 'zeros':
303 return F.conv1d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
304 weight, bias, self.stride,
305 _single(0), self.dilation, self.groups)
--> 306 return F.conv1d(input, weight, bias, self.stride,
307 self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size [64, 1, 7], expected input[1, 2, 7911424] to have 1 channels, but got 2 channels instead
@eeishaan, @pseeth, any info on how to tell it we want stereo?
I believe DAC works on the individual channel level. So you need to feed it each channel individually.
If you have a batch size of 1, just swapping the batch and channel dimensions will solve it, so that each channel is passed to the model as its own single-channel batch item. Like this:
# Encode and decode a stereo file with DAC by presenting each channel to the
# model as a separate single-channel batch item.
# Background (from the traceback in this thread): the encoder's first Conv1d
# has weight [64, 1, 7] and rejects input [1, 2, 7911424], i.e. it accepts
# exactly one input channel.
# NOTE(review): `dac`, `AudioSignal`, `torchaudio` and `args` are not defined
# in this snippet; they must be imported / created by the surrounding script.
model = dac.DAC.load("weights.pth")
model.to("cuda")
signal = AudioSignal(args.audio_file)
signal.to(model.device)
x = model.preprocess(signal.audio_data, signal.sample_rate)
# Swap dims 0 and 1 so a (1, channels, samples) tensor becomes
# (channels, 1, samples). This only works when the original batch size is 1,
# because the swapped tensor must still have exactly one channel per item.
x = x.transpose(0, 1)
z, codes, latents, _, _ = model.encode(x)
y = model.decode(z)
# Drop the size-1 dim at index 1 (where the original batch axis now sits).
# Presumably this leaves y as (channels, samples) for saving -- TODO confirm.
y = y.squeeze(1)
torchaudio.save('output_ft.wav', y.cpu(), signal.sample_rate)