Large resampling kernels are slow also on the forward pass
🐛 Describe the bug
I understand I still have to respond to my PR on kernel creation speed (sorry about that!), but I found another problem while trying to convolve 2 s long, 96 kHz room impulse responses into an audio track, and it probably extends to resampling: nn.functional.conv1d() crawls to a halt when using large kernels. I reported the bug on the PyTorch GitHub, https://github.com/pytorch/pytorch/issues/79222, but until it is fixed there you may want to patch it on your side. Alternatively, I can open a PR replacing the conv1d calls with fftconv1d. Code below.
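For scale, here is a minimal sketch of the use case described above; the exact sizes are assumptions based on the description (a 2 s room impulse response at 96 kHz is roughly a 192000-sample kernel):

import torch
import torch.nn.functional as F

sample_rate = 96000
# assumed sizes: 10 s of audio and a 2 s room impulse response, both at 96 kHz
audio = torch.randn(1, 1, 10 * sample_rate)   # (batch, channels, samples)
rir = torch.randn(1, 1, 2 * sample_rate)      # ~192000-sample kernel
# this direct time-domain convolution is the slow path being reported
wet = F.conv1d(audio, rir, padding=rir.shape[-1] // 2)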
from typing import Optional
import time
import torch
from torch import Tensor
import torch.nn.functional as F
# pylint: disable=no-member
# pylint: disable=suppressed-message


def fftconv1d(x: Tensor,
              weight: Tensor,
              bias: Optional[Tensor] = None,
              padding: int = 0,
              groups: int = 1) -> Tensor:
    """FFT-based equivalent of F.conv1d.
    Args
        x:       Tensor (batch_size, in_channels, size)
        weight:  Tensor (out_channels, in_channels//groups, kernel_size)
        bias:    Tensor [None], shape (out_channels,)
        padding: int [0]
        groups:  int [1]; in_channels and out_channels must be divisible by groups
        # stride and dilation = 1
    adapted from https://towardsdatascience.com/fourier-convolutions-in-pytorch-4cbd23c70005
    """
    assert x.ndim == 3, "x expected shape: (N, C, L)"
    assert weight.ndim == 3, "weight expected shape: (out_channels, in_channels//groups, kernel_size)"
    _out, _in, _ = weight.shape
    if bias is not None:
        assert bias.ndim == 1 and len(bias) == _out, "bias vector sized as out_channels required"
    assert not x.shape[1] % groups, f"in_channels must be divisible by groups {x.shape[1], groups}"
    assert not _out % groups, f"out_channels must be divisible by groups {_out, groups}"
    assert x.shape[1] == groups*_in, f"Given groups={groups} and weight {tuple(weight.shape)}, \
expected input {tuple(x.shape)} to have {groups*_in} channels"

    out = F.pad(x, [padding, padding])
    _pad = out.shape[-1] - weight.shape[-1]
    x_rfft = torch.fft.rfftn(out, dim=-1)
    w_rfft = torch.fft.rfftn(F.pad(weight, (0, _pad)), dim=-1)
    # conjugate the kernel spectrum: conv1d actually computes a cross-correlation
    w_rfft.imag *= -1
    if groups == 1:
        x_rfft = torch.einsum("ab..., cb... -> ac...", x_rfft, w_rfft)
    else:
        _o = _out//groups
        x_rfft = torch.cat([torch.einsum("ab..., cb... -> ac...",
                                         x_rfft[:, _in*g:_in*(g+1)],
                                         w_rfft[_o*g:_o*(g+1)])
                            for g in range(groups)], dim=1)
    out = torch.fft.irfftn(x_rfft, dim=-1)[..., :_pad + 1].contiguous()
    if bias is not None:
        out = out + bias.view(1, -1, 1)
    return out
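As a quick sanity check of the function above (sizes chosen arbitrarily for illustration), it should agree with F.conv1d to within floating-point tolerance:

x = torch.randn(4, 2, 96000)    # (batch, in_channels, samples)
w = torch.randn(3, 2, 8191)     # (out_channels, in_channels, kernel_size)
b = torch.randn(3)

y_ref = F.conv1d(x, w, bias=b, padding=100)
y_fft = fftconv1d(x, w, bias=b, padding=100)
assert torch.allclose(y_ref, y_fft, rtol=1e-3, atol=1e-3)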
def _testconv(cuda=True, grad=True, pad=None, out_channels=4, in_channels=2,
              batch_size=20, size=4096, ksize=1000, groups=1):
    if pad is None:
        pad = ksize//2
    signal = torch.randn(batch_size, in_channels, size)
    if grad:
        signal.requires_grad = True
    kernel = torch.randn(out_channels, in_channels//groups, ksize)
    bias = torch.randn(out_channels)
    print(f"\n signal: {tuple(signal.shape)}, kernel: {tuple(kernel.shape)}")
    if cuda:
        signal = signal.to(device="cuda")
        kernel = kernel.to(device="cuda")
        bias = bias.to(device="cuda")

    _start = time.time()
    y0 = F.conv1d(signal, kernel, bias=bias, padding=pad, groups=groups)
    if cuda:
        torch.cuda.synchronize()
    _fconv = time.time()

    y2 = fftconv1d(signal, kernel, bias=bias, padding=pad, groups=groups)
    if cuda:
        torch.cuda.synchronize()
    _fftconv = time.time()

    _test = f'test: cuda:{cuda}, grad:{grad}, pad:{pad}, out:{out_channels}, in:{in_channels}, groups:{groups}'
    print(_test)
    _nntime = 1000*(_fconv - _start)    # ms
    _fftime = 1000*(_fftconv - _fconv)  # ms
    _nn = _ff = ""
    if _nntime < _fftime:
        _nn = "\t\t\tnn.Conv1d is faster"
    elif _fftime < _nntime:
        _ff = "\t\t\tFFT faster"
    print(f" nn.Conv1d() time {_nntime:.1f} ms {_nn}")
    print(f" fftconv1d time  {_fftime:.1f} ms {_ff}")
    assert torch.allclose(y0, y2, rtol=1e-3, atol=1e-3), _test
def test_conv_opt():
    cuda = [True, False]
    grad = [True, False]
    padding = [0, None, 100]
    groups = [1, 2]
    out_channels = [4, 2]
    in_channels = [2, 8]
    batch_size = 20
    size = [4096, 14400]
    ksize = [9, 1000]
    for p in padding:
        for r in grad:
            for c in cuda:
                for g in groups:
                    for i in in_channels:
                        for o in out_channels:
                            for k in ksize:
                                for s in size:
                                    _testconv(cuda=c, grad=r, pad=p, out_channels=o, groups=g,
                                              in_channels=i, batch_size=batch_size, size=s, ksize=k)
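To run the full sweep as a standalone script, one option is a standard main guard:

if __name__ == "__main__":
    test_conv_opt()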
Versions
(abj) z@zXb:~/work$ python collect_env.py
Collecting environment information...
PyTorch version: 1.11.0
Is debug build: False
CUDA used to build PyTorch: 11.3
ROCM used to build PyTorch: N/A

OS: Ubuntu 18.04.5 LTS (x86_64)
GCC version: (Ubuntu 7.5.0-3ubuntu1~18.04) 7.5.0
Clang version: Could not collect
CMake version: version 3.22.4
Libc version: glibc-2.27

Python version: 3.9.5 (default, May 18 2021, 19:34:48) [GCC 7.3.0] (64-bit runtime)
Python platform: Linux-5.4.0-113-generic-x86_64-with-glibc2.27
Is CUDA available: True
CUDA runtime version: 11.6.55
GPU models and configuration: GPU 0: NVIDIA TITAN RTX
Nvidia driver version: 510.39.01
cuDNN version: Probably one of the following:
/usr/lib/x86_64-linux-gnu/libcudnn.so.8.0.5
/usr/lib/x86_64-linux-gnu/libcudnn_adv_infer.so.8.0.5
/usr/lib/x86_64-linux-gnu/libcudnn_adv_train.so.8.0.5
/usr/lib/x86_64-linux-gnu/libcudnn_cnn_infer.so.8.0.5
/usr/lib/x86_64-linux-gnu/libcudnn_cnn_train.so.8.0.5
/usr/lib/x86_64-linux-gnu/libcudnn_ops_infer.so.8.0.5
/usr/lib/x86_64-linux-gnu/libcudnn_ops_train.so.8.0.5
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
Versions of relevant libraries:
[pip3] denoising-diffusion-pytorch==0.7.1.1
[pip3] numpy==1.20.3
[pip3] pytorch-fid==0.2.0
[pip3] pytorch-lightning==1.4.0
[pip3] pytorch3d==0.2.0
[pip3] torch==1.11.0
[pip3] torch-ema==0.2
[pip3] torch-fidelity==0.3.0
[pip3] torch-tb-profiler==0.4.0
[pip3] torchaudio==0.11.0
[pip3] torchmetrics==0.4.1
[pip3] torchnmf==0.3.5.dev0
[pip3] torchvision==0.12.0
[conda] _pytorch_select 0.1 cpu_0
[conda] blas 2.114 mkl conda-forge
[conda] blas-devel 3.9.0 14_linux64_mkl conda-forge
[conda] cudatoolkit 11.3.1 h2bc3f7f_2
[conda] denoising-diffusion-pytorch 0.7.1.1 pypi_0 pypi
[conda] libblas 3.9.0 14_linux64_mkl conda-forge
[conda] libcblas 3.9.0 14_linux64_mkl conda-forge
[conda] liblapack 3.9.0 14_linux64_mkl conda-forge
[conda] liblapacke 3.9.0 14_linux64_mkl conda-forge
[conda] mkl 2022.0.1 h06a4308_117
[conda] mkl-devel 2022.0.1 h66538d2_117
[conda] mkl-include 2022.0.1 h06a4308_117
[conda] numpy 1.19.3 pypi_0 pypi
[conda] pytorch 1.11.0 py3.9_cuda11.3_cudnn8.2.0_0 pytorch
[conda] pytorch-fid 0.2.0 pypi_0 pypi
[conda] pytorch-lightning 1.4.0 pypi_0 pypi
[conda] pytorch-mutex 1.0 cuda pytorch
[conda] pytorch3d 0.2.0 pypi_0 pypi
[conda] torch-ema 0.2 pypi_0 pypi
[conda] torch-fidelity 0.3.0 pypi_0 pypi
[conda] torch-tb-profiler 0.4.0 pypi_0 pypi
[conda] torchaudio 0.11.0 py39_cu113 pytorch
[conda] torchmetrics 0.4.1 pypi_0 pypi
[conda] torchnmf 0.3.5.dev0 pypi_0 pypi
[conda] torchvision 0.12.0 py39_cu113 pytorch
Hi @xvdp, thanks for flagging, and sorry for the late reply. For functions that convolve inputs, I think it would be a good idea to allow users to choose whether to use time-domain-based or fft-based convolutions, perhaps via a use_fft bool flag. Let us know if you'd be interested in implementing this.
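A minimal sketch of what such a flag could look like, assuming a hypothetical wrapper named convolve1d (the name, signature, and dispatch are illustrative only, not the torchaudio API):

def convolve1d(x: Tensor, weight: Tensor, bias: Optional[Tensor] = None,
               padding: int = 0, groups: int = 1, use_fft: bool = False) -> Tensor:
    """Dispatch between time-domain and FFT-based convolution (illustrative sketch)."""
    if use_fft:
        return fftconv1d(x, weight, bias=bias, padding=padding, groups=groups)
    return F.conv1d(x, weight, bias=bias, padding=padding, groups=groups)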