Problems during backward
Hi,
First of all, thank you for sharing your code. I would like to use your interpolation during training, but unfortunately I get the following error during the backward pass:
File .../lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:143, in Closure.closure(self, *args, **kwargs)
140 self._zero_grad_fn()
142 if self._backward_fn is not None and step_output.closure_loss is not None:
--> 143 self._backward_fn(step_output.closure_loss)
145 return step_output
File ../lib/python3.9/site-packages/pytorch_lightning/loops/optimization/optimizer_loop.py:311, in OptimizerLoop._make_backward_fn.<locals>.backward_fn(loss)
310 def backward_fn(loss: Tensor) -> None:
--> 311 self.trainer._call_strategy_hook("backward", loss, optimizer, opt_idx)
313 # check if model weights are nan
314 if self.trainer._terminate_on_nan:
File .../lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py:1765, in Trainer._call_strategy_hook(self, hook_name, *args, **kwargs)
1762 return
1764 with self.profiler.profile(f"[Strategy]{self.strategy.__class__.__name__}.{hook_name}"):
-> 1765 output = fn(*args, **kwargs)
1767 # restore current_fx when nested context
1768 pl_module._current_fx_name = prev_fx_name
File ../lib/python3.9/site-packages/pytorch_lightning/strategies/strategy.py:168, in Strategy.backward(self, closure_loss, *args, **kwargs)
165 self.pre_backward(closure_loss)
166 closure_loss = self.precision_plugin.pre_backward(self.lightning_module, closure_loss)
--> 168 self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
170 closure_loss = self.precision_plugin.post_backward(self.lightning_module, closure_loss)
171 self.post_backward(closure_loss)
File ../lib/python3.9/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py:80, in PrecisionPlugin.backward(self, model, closure_loss, optimizer, *args, **kwargs)
78 # do backward pass
79 if model is not None and isinstance(model, pl.LightningModule):
---> 80 model.backward(closure_loss, optimizer, *args, **kwargs)
81 else:
82 self._run_backward(closure_loss, *args, **kwargs)
File ../lib/python3.9/site-packages/pytorch_lightning/core/lightning.py:1391, in LightningModule.backward(self, loss, optimizer, optimizer_idx, *args, **kwargs)
1374 def backward(
1375 self, loss: Tensor, optimizer: Optional[Optimizer], optimizer_idx: Optional[int], *args, **kwargs
1376 ) -> None:
1377 """Called to perform backward on the loss returned in :meth:`training_step`. Override this hook with your
1378 own implementation if you need to.
1379
(...)
1389 loss.backward()
1390 """
-> 1391 loss.backward(*args, **kwargs)
File ../lib/python3.9/site-packages/torch/_tensor.py:307, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
298 if has_torch_function_unary(self):
299 return handle_torch_function(
300 Tensor.backward,
301 (self,),
(...)
305 create_graph=create_graph,
306 inputs=inputs)
--> 307 torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File ../lib/python3.9/site-packages/torch/autograd/__init__.py:154, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
151 if retain_graph is None:
152 retain_graph = create_graph
--> 154 Variable._execution_engine.run_backward(
155 tensors, grad_tensors_, retain_graph, create_graph, inputs,
156 allow_unreachable=True, accumulate_grad=True)
File ../lib/python3.9/site-packages/torch/autograd/function.py:199, in BackwardCFunction.apply(self, *args)
195 raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom "
196 "Function is not allowed. You should only implement one "
197 "of them.")
198 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 199 return user_fn(self, *args)
File ./torchinterp1d/torchinterp1d/interp1d.py:155, in Interp1d.backward(ctx, grad_out)
152 @staticmethod
153 def backward(ctx, grad_out):
154 inputs = ctx.saved_tensors[1:]
--> 155 gradients = torch.autograd.grad(
156 ctx.saved_tensors[0],
157 [i for i in inputs if i is not None],
158 grad_out, retain_graph=True)
159 result = [None, ] * 5
160 pos = 0
File ../lib/python3.9/site-packages/torch/autograd/__init__.py:234, in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
231 if retain_graph is None:
232 retain_graph = create_graph
--> 234 return Variable._execution_engine.run_backward(
235 outputs, grad_outputs_, retain_graph, create_graph,
236 inputs, allow_unused, accumulate_grad=False)
File ../lib/python3.9/site-packages/torch/autograd/function.py:199, in BackwardCFunction.apply(self, *args)
195 raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom "
196 "Function is not allowed. You should only implement one "
197 "of them.")
198 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 199 return user_fn(self, *args)
File ./torchinterp1d/torchinterp1d/interp1d.py:155, in Interp1d.backward(ctx, grad_out)
152 @staticmethod
153 def backward(ctx, grad_out):
154 inputs = ctx.saved_tensors[1:]
--> 155 gradients = torch.autograd.grad(
156 ctx.saved_tensors[0],
157 [i for i in inputs if i is not None],
158 grad_out, retain_graph=True)
159 result = [None, ] * 5
160 pos = 0
File ../lib/python3.9/site-packages/torch/autograd/__init__.py:234, in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
231 if retain_graph is None:
232 retain_graph = create_graph
--> 234 return Variable._execution_engine.run_backward(
235 outputs, grad_outputs_, retain_graph, create_graph,
236 inputs, allow_unused, accumulate_grad=False)
[... skipping similar frames: BackwardCFunction.apply at line 199 (1546042 times), Interp1d.backward at line 155 (1546042 times), grad at line 234 (1546042 times)]
File ../lib/python3.9/site-packages/torch/autograd/function.py:199, in BackwardCFunction.apply(self, *args)
195 raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom "
196 "Function is not allowed. You should only implement one "
197 "of them.")
198 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 199 return user_fn(self, *args)
File ../torchinterp1d/torchinterp1d/interp1d.py:155, in Interp1d.backward(ctx, grad_out)
152 @staticmethod
153 def backward(ctx, grad_out):
154 inputs = ctx.saved_tensors[1:]
--> 155 gradients = torch.autograd.grad(
156 ctx.saved_tensors[0],
157 [i for i in inputs if i is not None],
158 grad_out, retain_graph=True)
159 result = [None, ] * 5
160 pos = 0
File ../lib/python3.9/site-packages/torch/autograd/__init__.py:234, in grad(outputs, inputs, grad_outputs, retain_graph, create_graph, only_inputs, allow_unused)
231 if retain_graph is None:
232 retain_graph = create_graph
--> 234 return Variable._execution_engine.run_backward(
235 outputs, grad_outputs_, retain_graph, create_graph,
236 inputs, allow_unused, accumulate_grad=False)
RuntimeError: Resource temporarily unavailable
Here is the snippet showing how the interpolation is performed:
# Time grid at the kept indices, repeated to one row per series as interp1d expects
t_lin = T_orig[tmp_time].repeat(data_flat.size(2), 1).to(device)
for idx_batch in range(segm_frames.size(0)):
    # Values to interpolate for this batch element, one row per series
    x_lin = data_flat[idx_batch, tmp_time].T.to(device)
    # Query times, repeated to match the number of series
    t_in_lin = T.repeat(data_flat.size(2), 1).to(device)
    t_in_lin.requires_grad = True
    yq_cpu = interp1d(t_lin, x_lin, t_in_lin, None)
    segm_frames[idx_batch, :] = yq_cpu.T
Any ideas about the cause of the problem? Thanks!
Maybe related to this issue?
@ahof1704 Hi, I'm having the same problem. Did you figure out how to solve it? Thanks!
Unfortunately not. I just switched to scipy for linear interpolation.
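For reference, here is a rough sketch of the kind of scipy fallback I mean (the function name and arguments are illustrative only; note that no gradients flow through the scipy call):
import torch
from scipy.interpolate import interp1d as scipy_interp1d

def scipy_linear_interp(t, x, t_new):
    # t, x, t_new are 1-D torch tensors; values are moved to NumPy,
    # interpolated linearly, and converted back. This step is not
    # differentiable, so it breaks the autograd graph.
    f = scipy_interp1d(t.detach().cpu().numpy(),
                       x.detach().cpu().numpy(),
                       kind="linear", fill_value="extrapolate")
    return torch.from_numpy(f(t_new.detach().cpu().numpy())).to(x.dtype)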
As far as I can tell, this is due to the batch gradient computation problem that torch.autograd.grad has. You can fix it either by using output.sum(dim=0) or by calculating the full Jacobian (the latter is a little tricky).
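As a minimal, self-contained sketch of the sum-over-batch idea (using plain autograd rather than torchinterp1d; the tensor names and shapes are illustrative only):
import torch

x = torch.randn(4, 8, requires_grad=True)   # (batch, n) input
y = (x ** 2).sin()                          # some batched output, shape (4, 8)

# Summing over the batch dimension first yields a single (n,)-shaped output,
# so torch.autograd.grad runs one backward pass for the whole batch instead
# of one pass per batch element.
(grad_x,) = torch.autograd.grad(y.sum(dim=0), x, grad_outputs=torch.ones(8))
print(grad_x.shape)  # torch.Size([4, 8])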
Sorry for my slow response on this. I would be glad to receive a PR if you find a way to fix it.
I also ran into the same problem. The backward() call just hangs for a long time and then raises this error. Any solutions? Thanks!