
[Bug] MultiDeviceKernel not supported in BoTorch

Open Bibyutatsu opened this issue 4 years ago • 8 comments

🐛 Bug

Hi, I wanted to train a SingleTaskGP on multiple GPUs, as I have 8 cards on my node. I searched and found GPyTorch's MultiDeviceKernel, which can be used to accomplish this, but I couldn't find anything similar in the BoTorch modules. So I replaced the covar_module of the SingleTaskGP with this kernel, and now I am getting the following error:

RuntimeError: graph_task->future_result_->completed() INTERNAL ASSERT FAILED at /opt/conda/conda-bld/pytorch_1587428398394/work/torch/csrc/autograd/engine.cpp:800, please report a bug to PyTorch.

I am unable to train BoTorch's SingleTaskGP on multiple GPUs with GPyTorch's MultiDeviceKernel.

To reproduce

**Code snippet to reproduce**

import torch
from botorch import fit_gpytorch_model
from botorch.acquisition.monte_carlo import qExpectedImprovement, qNoisyExpectedImprovement
from botorch.sampling.samplers import SobolQMCNormalSampler
from botorch.exceptions import BadInitialCandidatesWarning
from botorch.models import SingleTaskGP
import gpytorch
from botorch.optim import optimize_acqf
from gpytorch.mlls.sum_marginal_log_likelihood import ExactMarginalLogLikelihood


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bounds = torch.stack([torch.zeros(900), torch.ones(900)]).to(device)
BATCH_SIZE = 20
MC_SAMPLES = 256

train_x_ei = torch.randn((500,3)).to(device)
train_obj_ei = (train_x_ei**2).sum(dim=-1).unsqueeze(-1).to(device)
best_observed_value_ei = train_obj_ei.max().item()
model_ei = SingleTaskGP(train_x_ei, train_obj_ei)
base_covar_module = model_ei.covar_module
covar_module_multi = gpytorch.kernels.MultiDeviceKernel(
    base_covar_module, device_ids=range(8),
    output_device=device)
model_ei.covar_module = covar_module_multi
mll_ei = ExactMarginalLogLikelihood(model_ei.likelihood, model_ei)

for iteration in range(5):
    print(iteration)
    fit_gpytorch_model(mll_ei)
    
    qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)
        
    # for best_f, we use the best observed noisy values as an approximation
    qEI = qExpectedImprovement(
        model=model_ei, 
        best_f=train_obj_ei.max(),
        sampler=qmc_sampler
    )
    
    candidates, _ = optimize_acqf(
        acq_function=qEI,
        bounds=bounds,
        q=BATCH_SIZE,
        num_restarts=10,
        raw_samples=512,  # used for initialization heuristic
        options={"batch_limit": 5, "maxiter": 200},
    )

**Stack trace/error message**

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-1-c510035d82af> in <module>
     28 for iteration in range(5):
     29     print(iteration)
---> 30     fit_gpytorch_model(mll_ei)
     31 
     32     qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)

/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/fit.py in fit_gpytorch_model(mll, optimizer, **kwargs)
     99                 mll.model.load_state_dict(original_state_dict)
    100                 sample_all_priors(mll.model)
--> 101             mll, _ = optimizer(mll, track_iterations=False, **kwargs)
    102             if not any(issubclass(w.category, OptimizationWarning) for w in ws):
    103                 mll.eval()

/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/optim/fit.py in fit_gpytorch_scipy(mll, bounds, method, options, track_iterations, approx_mll)
    224             jac=True,
    225             options=options,
--> 226             callback=cb,
    227         )
    228         iterations = []

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/_minimize.py in minimize(fun, x0, args, method, jac, hess, hessp, bounds, constraints, tol, callback, options)
    608     elif meth == 'l-bfgs-b':
    609         return _minimize_lbfgsb(fun, x0, args, jac, bounds,
--> 610                                 callback=callback, **options)
    611     elif meth == 'tnc':
    612         return _minimize_tnc(fun, x0, args, jac, bounds, callback=callback,

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in _minimize_lbfgsb(fun, x0, args, jac, bounds, disp, maxcor, ftol, gtol, eps, maxfun, maxiter, iprint, callback, maxls, **unknown_options)
    343             # until the completion of the current minimization iteration.
    344             # Overwrite f and g:
--> 345             f, g = func_and_grad(x)
    346         elif task_str.startswith(b'NEW_X'):
    347             # new iteration

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/lbfgsb.py in func_and_grad(x)
    293     else:
    294         def func_and_grad(x):
--> 295             f = fun(x, *args)
    296             g = jac(x, *args)
    297             return f, g

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/optimize.py in function_wrapper(*wrapper_args)
    325     def function_wrapper(*wrapper_args):
    326         ncalls[0] += 1
--> 327         return function(*(wrapper_args + args))
    328 
    329     return ncalls, function_wrapper

/opt/conda/envs/torchenv/lib/python3.7/site-packages/scipy/optimize/optimize.py in __call__(self, x, *args)
     63     def __call__(self, x, *args):
     64         self.x = numpy.asarray(x).copy()
---> 65         fg = self.fun(x, *args)
     66         self.jac = fg[1]
     67         return fg[0]

/opt/conda/envs/torchenv/lib/python3.7/site-packages/botorch/optim/fit.py in _scipy_objective_and_grad(x, mll, property_dict)
    283         else:
    284             raise e  # pragma: nocover
--> 285     loss.backward()
    286     param_dict = OrderedDict(mll.named_parameters())
    287     grad = []

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    196                 products. Defaults to ``False``.
    197         """
--> 198         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    199 
    200     def register_hook(self, hook):

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     98     Variable._execution_engine.run_backward(
     99         tensors, grad_tensors, retain_graph, create_graph,
--> 100         allow_unreachable=True)  # allow_unreachable flag
    101 
    102 

RuntimeError: graph_task->future_result_->completed() INTERNAL ASSERT FAILED at /opt/conda/conda-bld/pytorch_1587428398394/work/torch/csrc/autograd/engine.cpp:800, please report a bug to PyTorch.

Expected Behavior

I expected the model to be trained across all the GPUs, so that fitting and sampling scale to multiple devices and run faster.

System information

Please complete the following information:

  • BoTorch Version : 0.2.0
  • GPyTorch Version: 1.1.1
  • PyTorch Version : 1.5.0
  • OS: Ubuntu 18.04

Bibyutatsu avatar May 03 '20 12:05 Bibyutatsu

Are you able to successfully use the MultiDeviceKernel in gpytorch? I just tried running the Simple_MultiGPU_GP_Regression tutorial, and I'm getting a RuntimeError during torch.autograd.backward. Not sure if it's an issue with the tutorial or with the kernel...

Balandat avatar May 03 '20 19:05 Balandat

@Balandat Yeah, I am also getting a RuntimeError while running the tutorial, but only when I set checkpoint_size=10000.

When I set checkpoint_size=0, the training completes without any issues.
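For reference, this is roughly how checkpoint_size is consumed inside the tutorial's train() function (a minimal sketch from my reading of the Simple_MultiGPU_GP_Regression notebook, not the exact tutorial code; model and optimizer setup are omitted):

import gpytorch

def train_loop(train_x, train_y, model, mll, optimizer,
               checkpoint_size, preconditioner_size, n_training_iter):
    # checkpoint_size=0 disables kernel partitioning/checkpointing;
    # a positive value splits the kernel matrix into chunks of that size.
    with gpytorch.beta_features.checkpoint_kernel(checkpoint_size), \
         gpytorch.settings.max_preconditioner_size(preconditioner_size):
        for _ in range(n_training_iter):
            optimizer.zero_grad()
            output = model(train_x)
            loss = -mll(output, train_y)
            loss.backward()  # the RuntimeError above is raised here
            optimizer.step()
    return model
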

This is the exact traceback in my case:

model, likelihood = train(train_x, train_y,
                          n_devices=n_devices, output_device=output_device,
                          checkpoint_size=10000,
                          preconditioner_size=100,
                          n_training_iter=20)

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-16-dba074377e42> in <module>
      3                           checkpoint_size=10000,
      4                           preconditioner_size=100,
----> 5                           n_training_iter=20)

<ipython-input-14-4da03f9af78d> in train(train_x, train_y, n_devices, output_device, checkpoint_size, preconditioner_size, n_training_iter)
     43 
     44         loss = closure()
---> 45         loss.backward()
     46 
     47         for i in range(n_training_iter):

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    196                 products. Defaults to ``False``.
    197         """
--> 198         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    199 
    200     def register_hook(self, hook):

/opt/conda/envs/torchenv/lib/python3.7/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     98     Variable._execution_engine.run_backward(
     99         tensors, grad_tensors, retain_graph, create_graph,
--> 100         allow_unreachable=True)  # allow_unreachable flag
    101 
    102 

RuntimeError: start (1250) + length (1250) exceeds dimension size (1250).

Bibyutatsu avatar May 04 '20 11:05 Bibyutatsu

Could you raise an issue upstream on the GPyTorch GitHub about this? I am not very familiar with this code; hopefully someone there can take a look and make sure this isn't a larger issue.

Balandat avatar May 04 '20 13:05 Balandat

@Balandat

Yeah, I have raised the same issue on the GPyTorch GitHub repo. Hopefully they will be able to tell us more about it.

Though originally the problem was that there is no MultiDeviceKernel implementation in BoTorch.

Bibyutatsu avatar May 04 '20 14:05 Bibyutatsu

Though originally the problem was that there is no MultiDeviceKernel implementation in BoTorch.

We don't necessarily aim to re-implement / re-wrap every model/kernel combination in BoTorch; that would be a lot of overhead. The goal is rather to make sure that components can be used and combined in a reasonably modular fashion (this is currently not as easy as it should be, and we'll be working on making it easier).
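For illustration, a minimal sketch of that kind of modular combination, assuming a later BoTorch version in which SingleTaskGP accepts a covar_module argument at construction (0.2.0 does not, which is why the reproduction snippet overwrites model_ei.covar_module after the fact):

import torch
import gpytorch
from botorch import fit_gpytorch_model
from botorch.models import SingleTaskGP
from gpytorch.mlls import ExactMarginalLogLikelihood

output_device = torch.device("cuda:0")
train_X = torch.rand(500, 3, device=output_device)
train_Y = train_X.pow(2).sum(dim=-1, keepdim=True)

# Wrap a standard base kernel so the kernel matrix is computed across all GPUs.
base_kernel = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=2.5))
covar_module = gpytorch.kernels.MultiDeviceKernel(
    base_kernel,
    device_ids=range(torch.cuda.device_count()),
    output_device=output_device,
)

# Hand the GPyTorch kernel to the BoTorch model instead of patching it afterwards.
model = SingleTaskGP(train_X, train_Y, covar_module=covar_module).to(output_device)
mll = ExactMarginalLogLikelihood(model.likelihood, model)
fit_gpytorch_model(mll)

Whether MultiDeviceKernel then trains cleanly end-to-end is a separate question (the backward error reported above), but the wiring itself stays modular.
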

Balandat avatar May 04 '20 14:05 Balandat

Sure! I will get back to you with what they say about this particular problem with the MultiDeviceKernel. You can also check the snippet below for any problem in how I initialised the covar_module, since the error could also be due to a mistake on my part.

import torch
from botorch import fit_gpytorch_model
from botorch.acquisition.monte_carlo import qExpectedImprovement, qNoisyExpectedImprovement
from botorch.sampling.samplers import SobolQMCNormalSampler
from botorch.exceptions import BadInitialCandidatesWarning
from botorch.models import SingleTaskGP
import gpytorch
from botorch.optim import optimize_acqf
from gpytorch.mlls.sum_marginal_log_likelihood import ExactMarginalLogLikelihood


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
bounds = torch.stack([torch.zeros(900), torch.ones(900)]).to(device)
BATCH_SIZE = 20
MC_SAMPLES = 256

train_x_ei = torch.randn((500,3)).to(device)
train_obj_ei = (train_x_ei**2).sum(dim=-1).unsqueeze(-1).to(device)
best_observed_value_ei = train_obj_ei.max().item()
model_ei = SingleTaskGP(train_x_ei, train_obj_ei)
base_covar_module = model_ei.covar_module
covar_module_multi = gpytorch.kernels.MultiDeviceKernel(
    base_covar_module, device_ids=range(8),
    output_device=device)
model_ei.covar_module = covar_module_multi
mll_ei = ExactMarginalLogLikelihood(model_ei.likelihood, model_ei)

for iteration in range(5):
    print(iteration)
    fit_gpytorch_model(mll_ei)
    
    qmc_sampler = SobolQMCNormalSampler(num_samples=MC_SAMPLES)
        
    # for best_f, we use the best observed noisy values as an approximation
    qEI = qExpectedImprovement(
        model=model_ei, 
        best_f=train_obj_ei.max(),
        sampler=qmc_sampler
    )
    
    candidates, _ = optimize_acqf(
        acq_function=qEI,
        bounds=bounds,
        q=BATCH_SIZE,
        num_restarts=10,
        raw_samples=512,  # used for initialization heuristic
        options={"batch_limit": 5, "maxiter": 200},
    )

Bibyutatsu avatar May 04 '20 14:05 Bibyutatsu

I don't see anything blatantly wrong with this. I'm not very familiar with MultiDeviceKernel and whether it fully supports generic batching etc. But let's figure out the upstream issue first before going there.

Balandat avatar May 04 '20 15:05 Balandat

Hi, I want to ask whether you have solved this problem? I want to use multiple GPUs too!

PMY0124 avatar Sep 25 '22 04:09 PMY0124

Closing in favor of https://github.com/cornellius-gp/gpytorch/issues/1132

esantorella avatar May 05 '23 17:05 esantorella