
[Bug?] Natural gradient descent and OrthogonallyDecoupledVariationalStrategy


🐛 Bug

I'd like to use natural gradient descent together with the orthogonally decoupled variational strategy, so I ran the NGD example notebook but with the OrthDecoupledApproximateGP from the variational strategy example notebook.

Perhaps I am doing something wrong (the natural variational distribution and the orthogonally decoupled variational strategy might not even be compatible with each other), but the optimization seems numerically unstable and eventually crashes with the Cholesky/NaN error below.

To reproduce

**Code snippet to reproduce**

import tqdm
import math
import torch
import gpytorch
from matplotlib import pyplot as plt
import urllib.request
import os
from scipy.io import loadmat
from math import floor

# this is for running the notebook in our testing framework
smoke_test = ('CI' in os.environ)

if not smoke_test and not os.path.isfile('../elevators.mat'):
    print('Downloading \'elevators\' UCI dataset...')
    urllib.request.urlretrieve('https://drive.google.com/uc?export=download&id=1jhWL3YUHvXIaftia4qeAyDwVxo6j1alk', '../elevators.mat')

if smoke_test:  # this is for running the notebook in our testing framework
    X, y = torch.randn(1000, 3), torch.randn(1000)
else:
    data = torch.Tensor(loadmat('../elevators.mat')['data'])
    X = data[:, :-1]
    X = X - X.min(0)[0]
    X = 2 * (X / X.max(0)[0]) - 1
    y = data[:, -1]

train_n = int(floor(0.8 * len(X)))
train_x = X[:train_n, :].contiguous()
train_y = y[:train_n].contiguous()

test_x = X[train_n:, :].contiguous()
test_y = y[train_n:].contiguous()

if torch.cuda.is_available():
    train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)

def make_orthogonal_vs(model, train_x):
    mean_inducing_points = torch.randn(1000, train_x.size(-1), dtype=train_x.dtype, device=train_x.device)
    covar_inducing_points = torch.randn(100, train_x.size(-1), dtype=train_x.dtype, device=train_x.device)

    covar_variational_strategy = gpytorch.variational.VariationalStrategy(
        model, covar_inducing_points,
        gpytorch.variational.CholeskyVariationalDistribution(covar_inducing_points.size(-2)),
        learn_inducing_locations=True
    )

    variational_strategy = gpytorch.variational.OrthogonallyDecoupledVariationalStrategy(
        covar_variational_strategy, mean_inducing_points,
        gpytorch.variational.DeltaVariationalDistribution(mean_inducing_points.size(-2)),
    )
    return variational_strategy

class GPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        # Note: this NaturalVariationalDistribution is constructed here but is not passed to
        # make_orthogonal_vs below, which builds its own Cholesky/Delta variational distributions
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(-2))
        variational_strategy = make_orthogonal_vs(self, train_x)
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

inducing_points = train_x[:500, :]
model = GPModel(inducing_points=inducing_points)
likelihood = gpytorch.likelihoods.GaussianLikelihood()

if torch.cuda.is_available():
    model = model.cuda()
    likelihood = likelihood.cuda()

variational_ngd_optimizer = gpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.size(0), lr=0.1)

hyperparameter_optimizer = torch.optim.Adam([
    {'params': model.hyperparameters()},
    {'params': likelihood.parameters()},
], lr=0.01)

model.train()
likelihood.train()
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

num_epochs = 1 if smoke_test else 4
epochs_iter = tqdm.notebook.tqdm(range(num_epochs), desc="Epoch")
for i in epochs_iter:
    minibatch_iter = tqdm.notebook.tqdm(train_loader, desc="Minibatch", leave=False)
    
    for x_batch, y_batch in minibatch_iter:
        ### Perform NGD step to optimize variational parameters
        variational_ngd_optimizer.zero_grad()
        hyperparameter_optimizer.zero_grad()
        output = model(x_batch)
        loss = -mll(output, y_batch)
        minibatch_iter.set_postfix(loss=loss.item())
        loss.backward()
        variational_ngd_optimizer.step()
        hyperparameter_optimizer.step()

model.eval()
likelihood.eval()
means = torch.tensor([0.])
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        preds = model(x_batch)
        means = torch.cat([means, preds.mean.cpu()])
means = means[1:]
print('Test MAE: {}'.format(torch.mean(torch.abs(means - test_y.cpu()))))

**Stack trace/error message**

Traceback (most recent call last):

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\cholesky.py", line 27, in psd_safe_cholesky
    L = torch.cholesky(A, upper=upper, out=out)

RuntimeError: cholesky_cpu: U(1,1) is zero, singular U.


During handling of the above exception, another exception occurred:

Traceback (most recent call last):

  File "C:\Users\fasx\Desktop\jrojsel\natural_gradients_test.py", line 109, in <module>
    output = model(x_batch)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\models\approximate_gp.py", line 81, in __call__
    return self.variational_strategy(inputs, prior=prior)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\_variational_strategy.py", line 135, in __call__
    x, inducing_points, inducing_values=variational_dist_u.mean, variational_inducing_covar=None

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\orthogonally_decoupled_variational_strategy.py", line 74, in forward
    full_output = self.model(torch.cat([x, inducing_points], dim=-2))

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 166, in __call__
    return super().__call__(x, prior=prior)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\_variational_strategy.py", line 131, in __call__
    variational_inducing_covar=variational_dist_u.lazy_covariance_matrix,

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 99, in forward
    L = self._cholesky_factor(induc_induc_covar)

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\memoize.py", line 68, in g
    return _add_to_cache_ignore_args(self, cache_name, method(self, *args, **kwargs))

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 72, in _cholesky_factor
    L = psd_safe_cholesky(delazify(induc_induc_covar).double(), jitter=settings.cholesky_jitter.value())

  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\cholesky.py", line 33, in psd_safe_cholesky
    f"cholesky_cpu: {isnan.sum().item()} of {A.numel()} elements of the {A.shape} tensor are NaN."

NanError: cholesky_cpu: 10000 of 10000 elements of the torch.Size([100, 100]) tensor are NaN.

Expected Behavior

Optimization should not crash

System information

Please complete the following information:

  • GPyTorch 1.1.1
  • PyTorch 1.5.1+cpu
  • Windows 10


jrojsel avatar Aug 26 '20 18:08 jrojsel

At the moment this is not supported. NGD only works with NaturalVariationalDistribution and TrilNaturalVariationalDistribution.
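
For reference, the supported combination is the standard SVGP setup from the NGD tutorial: a VariationalStrategy wrapping a NaturalVariationalDistribution (or TrilNaturalVariationalDistribution), with gpytorch.optim.NGD applied to the variational parameters only. A minimal sketch, reusing train_x/train_y from the snippet above (model and variable names are illustrative):

import torch
import gpytorch

class SVGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        # NGD-compatible distribution paired with the standard variational strategy
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(-2))
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)

model = SVGPModel(inducing_points=train_x[:500, :])
likelihood = gpytorch.likelihoods.GaussianLikelihood()

# NGD for the variational parameters, Adam for the model/likelihood hyperparameters
variational_ngd_optimizer = gpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.size(0), lr=0.1)
hyperparameter_optimizer = torch.optim.Adam([
    {'params': model.hyperparameters()},
    {'params': likelihood.parameters()},
], lr=0.01)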

gpleiss avatar Aug 28 '20 13:08 gpleiss

Has there been any more thought on this, @gpleiss, since the issue was opened? There have been recent developments on this orthogonal decoupling approach (https://arxiv.org/abs/1910.10596) that allow greater flexibility, and it would be great to see them implemented, especially for deep GPs.

mrlj-hash avatar Sep 23 '24 04:09 mrlj-hash