[Bug] MultiDeviceKernel not supported in BoTorch
🐛 Bug
Hi,
I want to train a SingleTaskGP on multiple GPUs, since I have 8 cards on my node. Searching around, I found GPyTorch's MultiDeviceKernel, which is meant for exactly this, but I couldn't find anything similar in the BoTorch modules. So I replaced the covar_module of the SingleTaskGP with this kernel, and I now get the following error:
RuntimeError: graph_task->future_result_->completed() INTERNAL ASSERT FAILED at /opt/conda/conda-bld/pytorch_1587428398394/work/torch/csrc/autograd/engine.cpp:800, please report a bug to PyTorch.
In short, I am unable to train BoTorch's SingleTaskGP on multiple GPUs with GPyTorch's MultiDeviceKernel.
To reproduce
** Code snippet to reproduce **
import torch
from botorch import fit_gpytorch_model
from botorch.acquisition.monte_carlo import qExpectedImprovement, qNoisyExpectedImprovement
from botorch.sampling.samplers import SobolQMCNormalSampler
from botorch.exceptions import BadInitialCandidatesWarning
from botorch.models import SingleTaskGP
import gpytorch
from botorch.optim import optimize_acqf
from gpytorch.mlls.sum_marginal_log_likelihood import ExactMarginalLogLikelihood
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bounds = torch.stack([torch.zeros(900), torch.ones(900)]).to(device)
BATCH_SIZE = 20
MC_SAMPLES = 256
train_x_ei = torch.randn((500,3)).to(device)
train_obj_ei = (train_x_ei**2).sum(dim=-1).unsqueeze(-1).to(device)
best_observed_value_ei = train_obj_ei.max().item()
model_ei = SingleTaskGP(train_x_ei, train_obj_ei)
base_covar_module = model_ei.covar_module
covar_module_multi = gpytorch.kernels.MultiDeviceKernel(
    base_covar_module, device_ids=range(8),
    output_device=device)
model_ei.covar_module = covar_module_multi
mll_ei = ExactMarginalLogLikelihood(model_ei.likelihood, model_ei)
for iteration in range(5):
    print(iteration)
    fit_gpytorch_model(mll_ei)
    qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)
    # for best_f, we use the best observed noisy values as an approximation
    qEI = qExpectedImprovement(
        model=model_ei,
        best_f=train_obj_ei.max(),
        sampler=qmc_sampler
    )
    candidates, _ = optimize_acqf(
        acq_function=qEI,
        bounds=bounds,
        q=BATCH_SIZE,
        num_restarts=10,
        raw_samples=512,  # used for initialization heuristic
        options={"batch_limit": 5, "maxiter": 200},
    )
** Stack trace/error message **
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-1-c510035d82af> in <module>
28 for iteration in range(5):
29 print(iteration)
---> 30 fit_gpytorch_model(mll_ei)
31
32 qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)
/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/fit.py in fit_gpytorch_model(mll, optimizer, **kwargs)
99 mll.model.load_state_dict(original_state_dict)
100 sample_all_priors(mll.model)
--> 101 mll, _ = optimizer(mll, track_iterations=False, **kwargs)
102 if not any(issubclass(w.category, OptimizationWarning) for w in ws):
103 mll.eval()
/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/optim/fit.py in fit_gpytorch_scipy(mll, bounds, method, options, track_iterations, approx_mll)
224 jac=True,
225 options=options,
--> 226 callback=cb,
227 )
228 iterations = []
/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
608 elif meth == 'l-bfgs-b':
609 return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 610 callback=callback, **options)
611 elif meth == 'tnc':
612 return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,
/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
343 # until the completion of the current minimization iteration.
344 # Overwrite f and g:
--> 345 f, g = func_and_grad(x)
346 elif task_str.startswith(b'NEW_X'):
347 # new iteration
/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in func_and_grad(x)
293 else:
294 def func_and_grad(x):
--> 295 f = fun(x, *args)
296 g = jac(x, *args)
297 return f, g
/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/optimize.py in function_wrapper(*wrapper_args)
325 def function_wrapper(*wrapper_args):
326 ncalls[0] += 1
--> 327 return function(*(wrapper_args + args))
328
329 return ncalls, function_wrapper
/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/optimize.py in __call__(self, x, *args)
63 def __call__(self, x, *args):
64 self.x = numpy.asarray(x).copy()
---> 65 fg = self.fun(x, *args)
66 self.jac = fg[1]
67 return fg[0]
/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/optim/fit.py in _scipy_objective_and_grad(x, mll, property_dict)
283 else:
284 raise e # pragma: nocover
--> 285 loss.backward()
286 param_dict = OrderedDict(mll.named_parameters())
287 grad = []
/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
196 products. Defaults to ``False``.
197 """
--> 198 torch.autograd.backward(self, gradient, retain_graph, create_graph)
199
200 def register_hook(self, hook):
/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
98 Variable._execution_engine.run_backward(
99 tensors, grad_tensors, retain_graph, create_graph,
--> 100 allow_unreachable=True) # allow_unreachable flag
101
102
RuntimeError: graph_task->future_result_->completed() INTERNAL ASSERT FAILED at /opt/conda/conda-bld/pytorch_1587428398394/work/torch/csrc/autograd/engine.cpp:800, please report a bug to PyTorch.
Expected Behavior
I expected all of the GPUs to be used for training the model, so that I can scale it across multiple GPUs for faster fitting and sampling.
System information
Please complete the following information:
- BoTorch Version: 0.2.0
- GPyTorch Version: 1.1.1
- PyTorch Version: 1.5.0
- OS: Ubuntu 18.04
Are you able to successfully use the MultiDeviceKernel in gpytorch? I just tried running the Simple_MultiGPU_GP_Regression tutorial, and I'm getting a RuntimeError during torch.autograd.backward. Not sure if it's an issue with the tutorial or with the kernel...
@Balandat
Yeah, I am also getting a RuntimeError while running the tutorial, but it occurs when I set checkpoint_size=10000.
When I set checkpoint_size=0, the training completes without any issues.
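As far as I can tell, checkpoint_size only enters the tutorial through GPyTorch's kernel-checkpointing beta feature, which partitions the kernel during the loss/backward computation (checkpoint_size=0 means no partitioning). Here is a minimal self-contained sketch of that pattern, using my own toy single-device model rather than the tutorial's multi-GPU code:

import torch
import gpytorch

# Toy data and model (my own example, not the tutorial's multi-GPU model).
train_x = torch.linspace(0, 1, 100)
train_y = torch.sin(6.28 * train_x) + 0.1 * torch.randn(100)

class ToyGPModel(gpytorch.models.ExactGP):
    def __init__(self, train_x, train_y, likelihood):
        super().__init__(train_x, train_y, likelihood)
        self.mean_module = gpytorch.means.ConstantMean()
        self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel())

    def forward(self, x):
        return gpytorch.distributions.MultivariateNormal(
            self.mean_module(x), self.covar_module(x)
        )

likelihood = gpytorch.likelihoods.GaussianLikelihood()
model = ToyGPModel(train_x, train_y, likelihood)
mll = gpytorch.mlls.ExactMarginalLogLikelihood(likelihood, model)
model.train()
likelihood.train()

checkpoint_size = 0  # 0 = no kernel partitioning; a positive value partitions the kernel
with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
        gpytorch.settings.max_preconditioner_size(100):
    loss = -mll(model(train_x), train_y)
    loss.backward()  # in the tutorial, this is where it fails for me when checkpoint_size=10000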
This is the exact traceback in my case:
model, likelihood = train(train_x, train_y,
                          n_devices=n_devices, output_device=output_device,
                          checkpoint_size=10000,
                          preconditioner_size=100,
                          n_training_iter=20)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-16-dba074377e42> in <module>
3 checkpoint_size=10000,
4 preconditioner_size=100,
----> 5 n_training_iter=20)
<ipython-input-14-4da03f9af78d> in train(train_x, train_y, n_devices, output_device, checkpoint_size, preconditioner_size, n_training_iter)
43
44 loss = closure()
---> 45 loss.backward()
46
47 for i in range(n_training_iter):
/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
196 products. Defaults to ``False``.
197 """
--> 198 torch.autograd.backward(self, gradient, retain_graph, create_graph)
199
200 def register_hook(self, hook):
/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
98 Variable._execution_engine.run_backward(
99 tensors, grad_tensors, retain_graph, create_graph,
--> 100 allow_unreachable=True) # allow_unreachable flag
101
102
RuntimeError: start (1250) + length (1250) exceeds dimension size (1250).
Could you raise an issue about this upstream on the gpytorch GitHub? I am not very familiar with this code; hopefully someone else can take a look there to make sure this isn't a larger issue.
@Balandat
Yeah, I have raised the same issue on the GPyTorch GitHub repo. Hopefully they will be able to tell us more about it.
Though originally the problem was that there is no MultiDeviceKernel implementation in BoTorch.
> Though originally the problem was that there is no MultiDeviceKernel implementation in BoTorch.
We don't necessarily aim to re-implement / re-wrap every model/kernel combination in BoTorch; that would be a lot of overhead. The goal is rather to make sure that components can be used and combined in a reasonably modular fashion (this is currently not as easy as it should be, and we'll be working on making it easier).
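For instance, the kind of combination we'd like to work smoothly is essentially what you did above. A rough sketch (untested here, and it won't help until the upstream kernel issue is resolved) would look something like this, with an explicit model.to(device) so that all hyperparameters end up on the output device:

import torch
import gpytorch
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

device = torch.device("cuda:0")
train_X = torch.rand(500, 3, device=device)
train_Y = (train_X ** 2).sum(dim=-1, keepdim=True)

model = SingleTaskGP(train_X, train_Y)
# Shard covariance evaluation across all visible GPUs by wrapping the default kernel.
model.covar_module = gpytorch.kernels.MultiDeviceKernel(
    model.covar_module,
    device_ids=list(range(torch.cuda.device_count())),
    output_device=device,
)
model = model.to(device)  # move the remaining hyperparameters to the output device
mll = ExactMarginalLogLikelihood(model.likelihood, model)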
Yeah, surely! I will report back to you with what they say about this particular problem with the MultiDeviceKernel. Could you also check the snippet for any problem in how I initialise it? It could also be an error on my part in how I set up the covar_module.
import torch
from botorch import fit_gpytorch_model
from botorch.acquisition.monte_carlo import qExpectedImprovement, qNoisyExpectedImprovement
from botorch.sampling.samplers import SobolQMCNormalSampler
from botorch.exceptions import BadInitialCandidatesWarning
from botorch.models import SingleTaskGP
import gpytorch
from botorch.optim import optimize_acqf
from gpytorch.mlls.sum_marginal_log_likelihood import ExactMarginalLogLikelihood
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bounds = torch.stack([torch.zeros(900), torch.ones(900)]).to(device)
BATCH_SIZE = 20
MC_SAMPLES = 256
train_x_ei = torch.randn((500,3)).to(device)
train_obj_ei = (train_x_ei**2).sum(dim=-1).unsqueeze(-1).to(device)
best_observed_value_ei = train_obj_ei.max().item()
model_ei = SingleTaskGP(train_x_ei, train_obj_ei)
base_covar_module = model_ei.covar_module
covar_module_multi = gpytorch.kernels.MultiDeviceKernel(
    base_covar_module, device_ids=range(8),
    output_device=device)
model_ei.covar_module = covar_module_multi
mll_ei = ExactMarginalLogLikelihood(model_ei.likelihood, model_ei)
for iteration in range(5):
    print(iteration)
    fit_gpytorch_model(mll_ei)
    qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)
    # for best_f, we use the best observed noisy values as an approximation
    qEI = qExpectedImprovement(
        model=model_ei,
        best_f=train_obj_ei.max(),
        sampler=qmc_sampler
    )
    candidates, _ = optimize_acqf(
        acq_function=qEI,
        bounds=bounds,
        q=BATCH_SIZE,
        num_restarts=10,
        raw_samples=512,  # used for initialization heuristic
        options={"batch_limit": 5, "maxiter": 200},
    )
I don't see anything blatantly wrong with this. I'm not very familiar with MultiDeviceKernel and whether it fully supports generic batching etc. But let's figure out the upstream issue first before going there.
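By "generic batching" I mean evaluating the kernel on inputs with extra batch dimensions, which the acquisition optimization relies on. A quick sanity check of the shapes I have in mind, using a standard kernel just for illustration:

import torch
import gpytorch

# Check that a kernel handles a batch of input sets of shape (batch, n, d).
kernel = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=2.5))
X = torch.randn(5, 10, 3)  # 5 batches of 10 points in 3 dimensions
K = kernel(X)              # lazy covariance; expected batch shape (5, 10, 10)
print(K.shape)             # torch.Size([5, 10, 10])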
Hi, I want to ask whether you have solved this problem? I want to use multiple GPUs too!
Closing in favor of https://github.com/cornellius-gp/gpytorch/issues/1132