🐛 Bug

I am replicating the Exact GP Regression with Multiple GPUs and Kernel Partitioning notebook. However, it seems that making the kernel partition size larger than the number of training points (I have reduced the training set size relative to the example notebook) results in an error when accessing gradients.

To reproduce

**Code snippet to reproduce**

import torch
import gpytorch
import sys
from LBFGS import FullBatchLBFGS
import os
from scipy.io import loadmat
import numpy as np

KERNEL_CHECKPOINT_SIZE = 5000  # passed to train() as checkpoint_size; larger than the 4573 training rows printed below
# Load '<dataset>.mat' and take its 'data' entry as a single tensor (indexed below as rows x columns).
dataset = 'protein'
data = torch.Tensor(loadmat(f'{dataset}.mat')['data'])
# The last column is used as the label; all other columns are features.
N = data.shape[0]
# make train/test splits (no validation split is built here)
n_train = int(0.1 * N)
n_test = int(0.05 * N)
print(f"N train: {n_train}") # 4573
print(f"N test: {n_test}") # 2286
train_x, train_y = data[:n_train, :-1], data[:n_train, -1]
test_x, test_y = data[n_test:, :-1], data[n_test:, -1]
# NOTE(review): the test slice starts at row n_test (< n_train), so it overlaps the training rows and holds N - n_test rows, not n_test -- confirm this is intended.
# normalize features (per-column statistics computed on the training rows only)
mean = train_x.mean(dim=-2, keepdim=True)
std = train_x.std(dim=-2, keepdim=True) + 1e-6 # prevent dividing by 0
train_x = (train_x - mean) / std
test_x = (test_x - mean) / std

# normalize labels (mean/std are rebound here to the training-label statistics)
mean, std = train_y.mean(),train_y.std()
train_y = (train_y - mean) / std
test_y = (test_y - mean) / std

# make contiguous
train_x, train_y = train_x.contiguous(), train_y.contiguous()
test_x, test_y = test_x.contiguous(), test_y.contiguous()
# All data is moved to the first CUDA device; the same device is later given to the kernel as its output device.
output_device = torch.device('cuda:0')

train_x, train_y = train_x.to(output_device), train_y.to(output_device)
test_x, test_y = test_x.to(output_device), test_y.to(output_device)
# Number of CUDA devices reported by torch; used as the device count for the multi-device kernel.
n_devices = torch.cuda.device_count()
print('Planning to run on {} GPUs.'.format(n_devices))

class ExactGPModel(gpytorch.models.ExactGP):  # exact GP: constant mean + scaled RBF kernel wrapped in a MultiDeviceKernel
    def __init__(self, train_x, train_y, likelihood, n_devices):  # n_devices: how many device ids (0..n_devices-1) the kernel is given
        super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        base_covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())
        # NOTE: output_device below is the module-level global, not a constructor argument.
        self.covar_module = gpytorch.kernels.MultiDeviceKernel(
            base_covar_module, device_ids=range(n_devices),
            output_device=output_device
        )
    # forward: evaluate the mean and covariance modules at x and return them as a MultivariateNormal.
    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

def train(train_x,  # training inputs handed to ExactGPModel and to the model call in closure()
          train_y,  # training targets handed to ExactGPModel and to the marginal log likelihood
          n_devices,  # forwarded to ExactGPModel; also reported in the final print
          output_device,  # device the likelihood and the model are moved to
          checkpoint_size,  # argument of gpytorch.beta_features.checkpoint_kernel
          preconditioner_size,  # argument of gpytorch.settings.max_preconditioner_size
          n_training_iter,  # upper bound on the number of optimizer steps
):
    likelihood = gpytorch.likelihoods.GaussianLikelihood().to(output_device)
    model = ExactGPModel(train_x, train_y, likelihood, n_devices).to(output_device)
    model.train()
    likelihood.train()
    # FullBatchLBFGS is imported from the local LBFGS module (not part of torch.optim).
    optimizer = FullBatchLBFGS(model.parameters(), lr=0.1)
    # "Loss" for GPs - the marginal log likelihood
    mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)

    # Everything below runs with the kernel checkpoint size and the preconditioner size limit applied.
    with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
         gpytorch.settings.max_preconditioner_size(preconditioner_size):
        # closure: clear gradients, run the model on the training inputs, return the negative MLL (no backward here).
        def closure():
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            return loss
        # Initial loss and gradients, computed once before the first optimizer step.
        loss = closure()
        loss.backward()
        # Each step receives the closure, the current loss and a 'max_ls' value of 10 via the options dict.
        for i in range(n_training_iter):
            options = {'closure': closure, 'current_loss': loss, 'max_ls': 10}
            loss, _, _, _, _, _, _, fail = optimizer.step(options)

            print('Iter %d/%d - Loss: %.3f   lengthscale: %.3f   noise: %.3f' % (
                i + 1, n_training_iter, loss.item(),
                model.covar_module.module.base_kernel.lengthscale.item(),
                model.likelihood.noise.item()
            ))
            # `fail` is the last value returned by optimizer.step; a truthy value ends training early.
            if fail:
                print('Convergence reached!')
                break
    # Returns the (model, likelihood) pair after training.
    print(f"Finished training on {train_x.size(0)} data points using {n_devices} GPUs.")
    return model, likelihood


model, likelihood = train(train_x, train_y,
                          n_devices=n_devices, output_device=output_device,
                          checkpoint_size=KERNEL_CHECKPOINT_SIZE,
                          preconditioner_size=100,
                          n_training_iter=20)

# Get into evaluation (predictive posterior) mode
model.eval()
likelihood.eval()
# Warm-up pass on two test points, without gradients, with fast_pred_var and a kernel checkpoint size of 1000.
with torch.no_grad(), gpytorch.settings.fast_pred_var(), gpytorch.beta_features.checkpoint_kernel(1000):
    # Make predictions on a small number of test points to get the test time caches computed
    latent_pred = model(test_x[:2, :])
    del latent_pred  # We don't care about these predictions, we really just want the caches.
# Prediction over all test inputs under the same settings as the warm-up pass.
with torch.no_grad(), gpytorch.settings.fast_pred_var(), gpytorch.beta_features.checkpoint_kernel(1000):
    latent_pred = model(test_x)
# Root-mean-square error between the predictive mean and test_y.
test_rmse = torch.sqrt(torch.mean(torch.pow(latent_pred.mean - test_y, 2)))
print(f"Test RMSE: {test_rmse.item()}")

**Stack trace/error message**

Traceback (most recent call last):
  File "................./multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 110, in <module>
    model, likelihood = train(train_x, train_y,
  File "................./multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 90, in train
    loss.backward()
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/function.py", line 253, in apply
    return user_fn(self, *args)
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/functions/_inv_quad_logdet.py", line 209, in backward
    matrix_arg_grads = lazy_tsr._quad_form_derivative(left_factors, right_factors)
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/sum_lazy_tensor.py", line 47, in _quad_form_derivative
    return tuple(
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/sum_lazy_tensor.py", line 48, in <genexpr>
    var for lazy_tensor in self.lazy_tensors for var in lazy_tensor._quad_form_derivative(left_vecs, right_vecs)
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py", line 235, in _quad_form_derivative
    x1.grad = torch.cat([sub_x1.grad.data for sub_x1 in sub_x1s], dim=-2)
  File "................./.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_evaluated_kernel_tensor.py", line 235, in <listcomp>
    x1.grad = torch.cat([sub_x1.grad.data for sub_x1 in sub_x1s], dim=-2)
AttributeError: 'NoneType' object has no attribute 'data'

Expected Behavior

sub_x1.grad should not be None; it should contain the gradient, and the model should train successfully. When I change the variable to KERNEL_CHECKPOINT_SIZE = 3000 (i.e. less than the training set size), the model trains and the correct output is produced:

Iter 1/20 - Loss: 1.176   lengthscale: 0.693   noise: 0.693
Convergence reached!
Finished training on 4573 data points using 2 GPUs.
Test RMSE: 0.6877772808074951

System information

GPyTorch Version: 1.7.0
PyTorch Version: 1.12.0+cu116
Linux / CentOS
Code running on 2 V100 16 GB GPUs

Jul 01 '22 17:07 m-julian

Also related to the bug above, if I set KERNEL_CHECKPOINT_SIZE = 0, which should mean that no partitioning is used (as in the tutorial), then I get the following error:

Traceback (most recent call last):
  File "..................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 110, in <module>
    model, likelihood = train(train_x, train_y,
  File "..................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 90, in train
    loss.backward()
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/torch/autograd/function.py", line 253, in apply
    return user_fn(self, *args)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/functions/_pivoted_cholesky.py", line 107, in backward
    Krows = apply_permutation(matrix, full_permutation, short_permutation)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/utils/permutation.py", line 79, in apply_permutation
    return delazify(matrix.__getitem__((*batch_idx, left_permutation.unsqueeze(-1), right_permutation.unsqueeze(-2))))
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_tensor.py", line 2268, in __getitem__
    res = self._get_indices(row_index, col_index, *batch_indices)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/cat_lazy_tensor.py", line 184, in _get_indices
    return torch.cat(res_list).view(target_shape).to(self.device)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cuda:1! (when checking argument for argument tensors in method wrapper_cat)

Jul 01 '22 20:07 m-julian

One more related issue: using a larger preconditioner size also fails, because the rows returned by apply_permutation can be on different devices, so row.gather raises a RuntimeError. Using a very small preconditioner size (I tried 1 and 2) did not produce the error shown below, because the returned rows were all on the same device.

https://github.com/cornellius-gp/gpytorch/blob/45e560c3417cb970c3a402f8a1a92f87b733e470/gpytorch/functions/_pivoted_cholesky.py#L67-L70

Traceback (most recent call last):
  File "/...................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 110, in <module>
    model, likelihood = train(train_x, train_y,
  File "/...................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 89, in train
    loss = closure()
  File "/...................../multi_gpu_kernel/example_multi_gpu_gpytorch/example_multi_gpu.py", line 86, in closure
    loss = -mll(output, train_y)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/module.py", line 30, in __call__
    outputs = self.forward(*inputs, **kwargs)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/mlls/exact_marginal_log_likelihood.py", line 62, in forward
    res = output.log_prob(target)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/distributions/multivariate_normal.py", line 169, in log_prob
    inv_quad, logdet = covar.inv_quad_logdet(inv_quad_rhs=diff.unsqueeze(-1), logdet=True)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_tensor.py", line 1338, in inv_quad_logdet
    preconditioner, precond_lt, logdet_p = self._preconditioner()
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/added_diag_lazy_tensor.py", line 100, in _preconditioner
    self._piv_chol_self = self._lazy_tensor.pivoted_cholesky(rank=max_iter)
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/lazy/lazy_tensor.py", line 1538, in pivoted_cholesky
    res, pivots = func(self.representation_tree(), rank, error_tol, *self.representation())
  File "...................../.venv/venv_gpytorch/lib/python3.9/site-packages/gpytorch/functions/_pivoted_cholesky.py", line 70, in forward
    L_m_new = row.gather(-1, pi_i)
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:1 and cuda:0! (when checking argument for argument index in method wrapper_gather)

Jul 06 '22 14:07 m-julian

Closing because checkpointing is now deprecated (as of v1.11)

Jun 02 '23 22:06 gpleiss

[Bug] Error accessing gradients when kernel partition size is larger than training set size for multiple gpu example

🐛 Bug

To reproduce

Expected Behavior

System information