[Bug?] Natural gradient descent and OrthogonallyDecoupledVariationalStrategy
🐛 Bug
I'd like to use natural gradient descent together with the orthogonally decoupled variational strategy, so I ran the NGD example notebook but swapped in the OrthDecoupledApproximateGP setup from the variational strategy example notebook.
Perhaps I am doing something wrong (the NGD variational distribution and the orthogonally decoupled variational strategy might not even be compatible with each other), but the combination appears numerically unstable and training crashes with a Cholesky error (see the stack trace below).
To reproduce
**Code snippet to reproduce**

```python
import tqdm
import math
import torch
import gpytorch
from matplotlib import pyplot as plt
import urllib.request
import os
from scipy.io import loadmat
from math import floor

# this is for running the notebook in our testing framework
smoke_test = ('CI' in os.environ)

if not smoke_test and not os.path.isfile('../elevators.mat'):
    print('Downloading \'elevators\' UCI dataset...')
    urllib.request.urlretrieve('https://drive.google.com/uc?export=download&id=1jhWL3YUHvXIaftia4qeAyDwVxo6j1alk', '../elevators.mat')

if smoke_test:  # this is for running the notebook in our testing framework
    X, y = torch.randn(1000, 3), torch.randn(1000)
else:
    data = torch.Tensor(loadmat('../elevators.mat')['data'])
    X = data[:, :-1]
    X = X - X.min(0)[0]
    X = 2 * (X / X.max(0)[0]) - 1
    y = data[:, -1]

train_n = int(floor(0.8 * len(X)))
train_x = X[:train_n, :].contiguous()
train_y = y[:train_n].contiguous()
test_x = X[train_n:, :].contiguous()
test_y = y[train_n:].contiguous()

if torch.cuda.is_available():
    train_x, train_y, test_x, test_y = train_x.cuda(), train_y.cuda(), test_x.cuda(), test_y.cuda()

from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(train_x, train_y)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)
test_dataset = TensorDataset(test_x, test_y)
test_loader = DataLoader(test_dataset, batch_size=1024, shuffle=False)


def make_orthogonal_vs(model, train_x):
    mean_inducing_points = torch.randn(1000, train_x.size(-1), dtype=train_x.dtype, device=train_x.device)
    covar_inducing_points = torch.randn(100, train_x.size(-1), dtype=train_x.dtype, device=train_x.device)

    covar_variational_strategy = gpytorch.variational.VariationalStrategy(
        model, covar_inducing_points,
        gpytorch.variational.CholeskyVariationalDistribution(covar_inducing_points.size(-2)),
        learn_inducing_locations=True
    )

    variational_strategy = gpytorch.variational.OrthogonallyDecoupledVariationalStrategy(
        covar_variational_strategy, mean_inducing_points,
        gpytorch.variational.DeltaVariationalDistribution(mean_inducing_points.size(-2)),
    )
    return variational_strategy


class GPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        # NB: this distribution is created but never used; make_orthogonal_vs builds
        # its own Cholesky/Delta variational distributions below.
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(inducing_points.size(-2))
        variational_strategy = make_orthogonal_vs(self, train_x)
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)


inducing_points = train_x[:500, :]
model = GPModel(inducing_points=inducing_points)
likelihood = gpytorch.likelihoods.GaussianLikelihood()

if torch.cuda.is_available():
    model = model.cuda()
    likelihood = likelihood.cuda()

variational_ngd_optimizer = gpytorch.optim.NGD(model.variational_parameters(), num_data=train_y.size(0), lr=0.1)
hyperparameter_optimizer = torch.optim.Adam([
    {'params': model.hyperparameters()},
    {'params': likelihood.parameters()},
], lr=0.01)

model.train()
likelihood.train()
mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

num_epochs = 1 if smoke_test else 4
epochs_iter = tqdm.notebook.tqdm(range(num_epochs), desc="Epoch")
for i in epochs_iter:
    minibatch_iter = tqdm.notebook.tqdm(train_loader, desc="Minibatch", leave=False)
    for x_batch, y_batch in minibatch_iter:
        ### Perform NGD step to optimize variational parameters
        variational_ngd_optimizer.zero_grad()
        hyperparameter_optimizer.zero_grad()
        output = model(x_batch)
        loss = -mll(output, y_batch)
        minibatch_iter.set_postfix(loss=loss.item())
        loss.backward()
        variational_ngd_optimizer.step()
        hyperparameter_optimizer.step()

model.eval()
likelihood.eval()
means = torch.tensor([0.])
with torch.no_grad():
    for x_batch, y_batch in test_loader:
        preds = model(x_batch)
        means = torch.cat([means, preds.mean.cpu()])
means = means[1:]
print('Test MAE: {}'.format(torch.mean(torch.abs(means - test_y.cpu()))))
```
**Stack trace/error message**

```
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\cholesky.py", line 27, in psd_safe_cholesky
    L = torch.cholesky(A, upper=upper, out=out)
RuntimeError: cholesky_cpu: U(1,1) is zero, singular U.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "C:\Users\fasx\Desktop\jrojsel\natural_gradients_test.py", line 109, in <module>
    output = model(x_batch)
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\models\approximate_gp.py", line 81, in __call__
    return self.variational_strategy(inputs, prior=prior)
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\_variational_strategy.py", line 135, in __call__
    x, inducing_points, inducing_values=variational_dist_u.mean, variational_inducing_covar=None
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\orthogonally_decoupled_variational_strategy.py", line 74, in forward
    full_output = self.model(torch.cat([x, inducing_points], dim=-2))
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 166, in __call__
    return super().__call__(x, prior=prior)
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\_variational_strategy.py", line 131, in __call__
    variational_inducing_covar=variational_dist_u.lazy_covariance_matrix,
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\module.py", line 28, in __call__
    outputs = self.forward(*inputs, **kwargs)
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 99, in forward
    L = self._cholesky_factor(induc_induc_covar)
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\memoize.py", line 68, in g
    return _add_to_cache_ignore_args(self, cache_name, method(self, *args, **kwargs))
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\variational\variational_strategy.py", line 72, in _cholesky_factor
    L = psd_safe_cholesky(delazify(induc_induc_covar).double(), jitter=settings.cholesky_jitter.value())
  File "C:\ProgramData\Anaconda3\lib\site-packages\gpytorch\utils\cholesky.py", line 33, in psd_safe_cholesky
    f"cholesky_cpu: {isnan.sum().item()} of {A.numel()} elements of the {A.shape} tensor are NaN."
NanError: cholesky_cpu: 10000 of 10000 elements of the torch.Size([100, 100]) tensor are NaN.
```
Expected Behavior
Optimization should not crash.
System information
- GPyTorch 1.1.1
- PyTorch 1.5.1+cpu
- Windows 10
At the moment this is not supported. NGD only works with NaturalVariationalDistribution and TrilNaturalVariationalDistribution.
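For reference, a minimal sketch of the kind of setup NGD does support: a standard VariationalStrategy wrapping a NaturalVariationalDistribution, as in the SVGP-with-NGD tutorial. The class name NGDCompatibleGPModel is made up for illustration; everything else mirrors the reproduction snippet above.

```python
# Sketch of an NGD-compatible model: plain VariationalStrategy + NaturalVariationalDistribution.
# The class name is hypothetical; the structure follows the SVGP-with-NGD tutorial.
import torch
import gpytorch


class NGDCompatibleGPModel(gpytorch.models.ApproximateGP):
    def __init__(self, inducing_points):
        # The natural variational distribution is actually passed to the strategy here,
        # so model.variational_parameters() yields natural parameters for gpytorch.optim.NGD.
        variational_distribution = gpytorch.variational.NaturalVariationalDistribution(
            inducing_points.size(-2)
        )
        variational_strategy = gpytorch.variational.VariationalStrategy(
            self, inducing_points, variational_distribution, learn_inducing_locations=True
        )
        super().__init__(variational_strategy)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        mean_x = self.mean_module(x)
        covar_x = self.covar_module(x)
        return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
```

The two-optimizer loop from the reproduction snippet (NGD on `model.variational_parameters()`, Adam on the hyperparameters) then applies unchanged.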
Has there been more thought on this, @gpleiss, since the issue was opened? There are recent developments on this orthogonal decoupling approach (https://arxiv.org/abs/1910.10596) that allow greater flexibility; it would be great to see them implemented, especially for deep GPs.
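Until something along those lines is supported, one interim possibility (a sketch only, following the standard non-NGD SVGP training recipe rather than anything officially documented for this combination) is to keep the orthogonally decoupled strategy but optimize its variational parameters with an ordinary gradient optimizer. Here GPModel, train_x, train_y, and train_loader refer to the reproduction snippet above.

```python
# Sketch of an interim workaround: train the orthogonally decoupled model with plain
# Adam instead of NGD. GPModel, train_x, train_y, and train_loader are the objects
# defined in the reproduction snippet above; this is not an officially documented recipe.
import torch
import gpytorch

model = GPModel(inducing_points=train_x[:500, :])
likelihood = gpytorch.likelihoods.GaussianLikelihood()

# A single optimizer over all parameters; model.parameters() already includes the
# Cholesky/Delta variational parameters used by the decoupled strategy.
optimizer = torch.optim.Adam([
    {'params': model.parameters()},
    {'params': likelihood.parameters()},
], lr=0.01)

mll = gpytorch.mlls.VariationalELBO(likelihood, model, num_data=train_y.size(0))

model.train()
likelihood.train()
for x_batch, y_batch in train_loader:
    optimizer.zero_grad()
    output = model(x_batch)
    loss = -mll(output, y_batch)
    loss.backward()
    optimizer.step()
```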