[Bug] Error accessing gradients when kernel partition size is larger than training set size for multiple gpu example
🐛 Bug
I am replicating the Exact GP Regression with Multiple GPUs and Kernel Partitioning notebook. However, it seems that making the kernel partition size larger than the number of training points (I have reduced the training data size relative to the example notebook) results in an error when accessing gradients.
To reproduce
**Code snippet to reproduce**
import torch
import gpytorch
import sys
from LBFGS import FullBatchLBFGS
import os
from scipy.io import loadmat
import numpy as np
# Kernel partition size later passed to train() as checkpoint_size.
# The report describes 5000 (larger than the 4573 training rows) as the
# value that triggers the gradient error.
KERNEL_CHECKPOINT_SIZE = 5000
# Name of the .mat file (without extension) that holds the data matrix.
dataset = 'protein'
data = torch.Tensor(loadmat(f'{dataset}.mat')['data'])
N = data.shape[0]
# make train/test split: the last column is used as the target, the
# remaining columns as features
n_train = int(0.1 * N)
n_test = int(0.05 * N)
print(f"N train: {n_train}") # 4573
print(f"N test: {n_test}") # 2286
train_x, train_y = data[:n_train, :-1], data[:n_train, -1]
# NOTE(review): this slice starts at row n_test, so it overlaps the training
# rows [n_test, n_train) and holds N - n_test rows rather than n_test rows --
# confirm whether data[n_train:n_train + n_test] was intended.
test_x, test_y = data[n_test:, :-1], data[n_test:, -1]
# normalize features using statistics computed on the training set only
mean = train_x.mean(dim=-2, keepdim=True)
std = train_x.std(dim=-2, keepdim=True) + 1e-6 # prevent dividing by 0
train_x = (train_x - mean) / std
test_x = (test_x - mean) / std
# normalize labels (mean/std are rebound here to the label statistics)
mean, std = train_y.mean(),train_y.std()
train_y = (train_y - mean) / std
test_y = (test_y - mean) / std
# make contiguous
train_x, train_y = train_x.contiguous(), train_y.contiguous()
test_x, test_y = test_x.contiguous(), test_y.contiguous()
# All tensors are moved to the first CUDA device, which is also used as the
# output device for the model below.
output_device = torch.device('cuda:0')
train_x, train_y = train_x.to(output_device), train_y.to(output_device)
test_x, test_y = test_x.to(output_device), test_y.to(output_device)
# Use every CUDA device that PyTorch reports.
n_devices = torch.cuda.device_count()
print('Planning to run on {} GPUs.'.format(n_devices))
class ExactGPModel(gpytorch.models.ExactGP):
    """Exact GP model with a constant mean and a multi-device scaled RBF kernel."""

    def __init__(self, train_x, train_y, likelihood, n_devices):
        """Create the mean and covariance modules.

        ``n_devices`` determines the device ids (``range(n_devices)``) given
        to ``MultiDeviceKernel``. The kernel's output device is taken from the
        module-level ``output_device`` variable, not from an argument.
        """
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        rbf_kernel = gpytorch.kernels.RBFKernel()
        scaled_kernel = gpytorch.kernels.ScaleKernel(rbf_kernel)
        self.covar_module = gpytorch.kernels.MultiDeviceKernel(
            scaled_kernel,
            device_ids=range(n_devices),
            output_device=output_device,
        )

    def forward(self, x):
        """Return the multivariate normal built from the mean and kernel at ``x``."""
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x),
            self.covar_module(x),
        )
def train(
    train_x,
    train_y,
    n_devices,
    output_device,
    checkpoint_size,
    preconditioner_size,
    n_training_iter,
):
    """Fit an ``ExactGPModel`` with full-batch L-BFGS and return it.

    The likelihood and model are moved to ``output_device``. Every loss
    evaluation runs inside ``checkpoint_kernel(checkpoint_size)`` and
    ``max_preconditioner_size(preconditioner_size)``. At most
    ``n_training_iter`` optimizer steps are taken; the loop stops early when
    the optimizer's last return value (``fail``) is truthy.

    Returns the ``(model, likelihood)`` pair.
    """
    likelihood = gpytorch.likelihoods.GaussianLikelihood().to(output_device)
    model = ExactGPModel(train_x, train_y, likelihood, n_devices).to(output_device)
    model.train()
    likelihood.train()

    optimizer = FullBatchLBFGS(model.parameters(), lr=0.1)

    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    def closure():
        # Re-evaluate the negative marginal log likelihood from scratch.
        optimizer.zero_grad()
        return -mll(model(train_x), train_y)

    with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
            gpytorch.settings.max_preconditioner_size(preconditioner_size):
        # The optimizer is handed the current loss on every step, so one
        # forward/backward pass is done before the first step.
        loss = closure()
        loss.backward()

        for iteration in range(1, n_training_iter + 1):
            step_options = {'closure': closure, 'current_loss': loss, 'max_ls': 10}
            loss, _, _, _, _, _, _, fail = optimizer.step(step_options)

            lengthscale = model.covar_module.module.base_kernel.lengthscale.item()
            noise = model.likelihood.noise.item()
            print('Iter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % (
                iteration, n_training_iter, loss.item(), lengthscale, noise
            ))

            if fail:
                print('Convergence reached!')
                break

    print(f"Finished training on {train_x.size(0)} data points using {n_devices} GPUs.")
    return model, likelihood
# Fit the model on the training split.
model, likelihood = train(
    train_x,
    train_y,
    n_devices=n_devices,
    output_device=output_device,
    checkpoint_size=KERNEL_CHECKPOINT_SIZE,
    preconditioner_size=100,
    n_training_iter=20,
)

# Get into evaluation (predictive posterior) mode
model.eval()
likelihood.eval()

with torch.no_grad(), gpytorch.settings.fast_pred_var(), \
        gpytorch.beta_features.checkpoint_kernel(1000):
    # Predict on two test points only, so the test-time caches get computed.
    latent_pred = model(test_x[:2, :])
    # These predictions are not needed; only the caches are.
    del latent_pred

with torch.no_grad(), gpytorch.settings.fast_pred_var(), \
        gpytorch.beta_features.checkpoint_kernel(1000):
    latent_pred = model(test_x)

# Root-mean-squared error of the predictive mean against the test targets.
squared_error = torch.pow(latent_pred.mean - test_y, 2)
test_rmse = torch.sqrt(torch.mean(squared_error))
print(f"Test RMSE: {test_rmse.item()}")
**Stack trace/error message**
Traceback (most recent call last):
File "................./multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 110, in <module>
model, likelihood = train(train_x, train_y,
File "................./multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 90, in train
loss.backward()
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/function.py", line 253, in apply
return user_fn(self, *args)
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/functions/_inv_quad_logdet.py", line 209, in backward
matrix_arg_grads = lazy_tsr._quad_form_derivative(left_factors, right_factors)
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/sum_lazy_tensor.py", line 47, in _quad_form_derivative
return tuple(
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/sum_lazy_tensor.py", line 48, in <genexpr>
var for lazy_tensor in self.lazy_tensors for var in lazy_tensor._quad_form_derivative(left_vecs, right_vecs)
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py", line 235, in _quad_form_derivative
x1.grad = torch.cat([sub_x1.grad.data for sub_x1 in sub_x1s], dim=-2)
File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py", line 235, in <listcomp>
x1.grad = torch.cat([sub_x1.grad.data for sub_x1 in sub_x1s], dim=-2)
AttributeError: 'NoneType' object has no attribute 'data'
Expected Behavior
sub_x1.grad should not be None; it should instead contain the gradient, and the model should train successfully. When I change the variable to KERNEL_CHECKPOINT_SIZE = 3000 (i.e., less than the training set size), the model trains and the correct output is produced:
Iter 1/20 - Loss: 1.176 lengthscale: 0.693 noise: 0.693
Convergence reached!
Finished training on 4573 data points using 2 GPUs.
Test RMSE: 0.6877772808074951
System information
- GPyTorch Version: 1.7.0
- PyTorch Version: 1.12.0+cu116
- Linux / CentOS
- Code running on 2 V100 16 GB GPUs
Also related to the bug above, if I set KERNEL_CHECKPOINT_SIZE = 0, which should mean that no partitioning is used (as in the tutorial), then I get the following error:
Traceback (most recent call last):
File "..................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 110, in <module>
model, likelihood = train(train_x, train_y,
File "..................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 90, in train
loss.backward()
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/function.py", line 253, in apply
return user_fn(self, *args)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/functions/_pivoted_cholesky.py", line 107, in backward
Krows = apply_permutation(matrix, full_permutation, short_permutation)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/utils/permutation.py", line 79, in apply_permutation
return delazify(matrix.__getitem__((*batch_idx, left_permutation.unsqueeze(-1), right_permutation.unsqueeze(-2))))
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_tensor.py", line 2268, in __getitem__
res = self._get_indices(row_index, col_index, *batch_indices)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/cat_lazy_tensor.py", line 184, in _get_indices
return torch.cat(res_list).view(target_shape).to(self.device)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument tensors in method wrapper_cat)
One more related issue: using a larger preconditioner size also breaks, because the rows returned by apply_permutation can be on different devices, so row.gather then raises a RuntimeError. Using a very small preconditioner size (I tried setting it to 1 and 2) did not produce the error below, because the rows that were returned were on the same device.
https://github.com/cornellius-gp/gpytorch/blob/45e560c3417cb970c3a402f8a1a92f87b733e470/gpytorch/functions/_pivoted_cholesky.py#L67-L70
Traceback (most recent call last):
File "/...................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 110, in <module>
model, likelihood = train(train_x, train_y,
File "/...................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 89, in train
loss = closure()
File "/...................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 86, in closure
loss = -mll(output, train_y)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/module.py", line 30, in __call__
outputs = self.forward(*inputs, **kwargs)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py", line 62, in forward
res = output.log_prob(target)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/distributions/multivariate_normal.py", line 169, in log_prob
inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=diff.unsqueeze(-1), logdet=True)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_tensor.py", line 1338, in inv_quad_logdet
preconditioner, precond_lt, logdet_p = self._preconditioner()
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/added_diag_lazy_tensor.py", line 100, in _preconditioner
self._piv_chol_self = self._lazy_tensor.pivoted_cholesky(rank=max_iter)
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_tensor.py", line 1538, in pivoted_cholesky
res, pivots = func(self.representation_tree(), rank, error_tol, *self.representation())
File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/functions/_pivoted_cholesky.py", line 70, in forward
L_m_new = row.gather(-1, pi_i)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument index in method wrapper_gather)
Closing because checkpointing is now deprecated (as of v1.11)