Gradients in the optimizer's param_groups are undefined when using the MPS device
library(torch)

f <- function(device) {
  nn_spiral_net <- nn_module("nn_spiral_net",
    initialize = function() {
      self$fc <- nn_linear(2, 1)
    },
    forward = function(x) {
      self$fc(x)
    }
  )
  # Create model instance
  model <- nn_spiral_net()
  x <- torch_randn(1, 2)$to(device = device)
  y <- torch_tensor(1L)$to(device = device)
  optimizer <- optim_sgd(model$parameters, lr = 0.01)
  # Move model to device (note: AFTER the optimizer was created)
  model$to(device = device)
  output <- model(x)
  loss <- nnf_cross_entropy(output, y)
  # Backward pass
  optimizer$zero_grad()
  loss$backward()
  list(
    model$parameters$fc.weight$grad,
    optimizer$param_groups[[1]]$params$fc.weight$grad
  )
}
f("cpu")
#> [[1]]
#> torch_tensor
#> 0 0
#> [ CPUFloatType{1,2} ]
#>
#> [[2]]
#> torch_tensor
#> 0 0
#> [ CPUFloatType{1,2} ]
f("mps")
#> [[1]]
#> torch_tensor
#> 0 0
#> [ MPSFloatType{1,2} ]
#>
#> [[2]]
#> [ Tensor (undefined) ] <------------ HERE IT IS UNDEFINED
Created on 2024-11-05 with reprex v2.1.1
session info:
> sessionInfo()
R version 4.4.1 (2024-06-14)
Platform: aarch64-apple-darwin20
Running under: macOS Sonoma 14.1.1
Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.0
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
time zone: Europe/Berlin
tzcode source: internal
attached base packages:
[1] stats graphics grDevices utils datasets methods base
other attached packages:
[1] mlbench_2.1-5 torch_0.13.0 nvimcom_0.9-159
loaded via a namespace (and not attached):
[1] vctrs_0.6.5 cli_3.6.3 knitr_1.48
[4] rlang_1.1.4 xfun_0.47.1 processx_3.8.4
[7] coro_1.0.5 glue_1.8.0 bit_4.5.0
[10] clipr_0.8.0 htmltools_0.5.8.1 ps_1.8.1
[13] fansi_1.0.6 rmarkdown_2.28 tibble_3.2.1
[16] evaluate_0.24.0 fastmap_1.2.0 yaml_2.3.10
[19] lifecycle_1.0.4 compiler_4.4.1 fs_1.6.4
[22] pkgconfig_2.0.3 Rcpp_1.0.13 rstudioapi_0.16.0
[25] digest_0.6.37 R6_2.5.1 utf8_1.2.4
[28] reprex_2.1.1 pillar_1.9.0 callr_3.7.6
[31] magrittr_2.0.3 tools_4.4.1 withr_3.0.1
[34] bit64_4.5.2
Hi @sebffischer,
The problem seems to be that the model is moved to the device only after its parameters have been passed to the optimizer. When the parameters are moved to another device, a copy of the tensors is apparently created, so the optimizer keeps holding the old ones and the link between them is severed (CUDA has the same problem, I tested it). The equivalent code works in PyTorch, so this definitely looks like a bug.
As a workaround for now, move the model to the device before passing its parameters to the optimizer:
f <- function(device) {
  nn_spiral_net <- nn_module("nn_spiral_net",
    initialize = function() {
      self$fc <- nn_linear(2, 1)
    },
    forward = function(x) {
      self$fc(x)
    }
  )
  # Create model instance
  model <- nn_spiral_net()
  x <- torch_randn(1, 2)$to(device = device)
  y <- torch_tensor(1L)$to(device = device)
  # Move model to device BEFORE creating the optimizer
  model$to(device = device)
  optimizer <- optim_sgd(model$parameters, lr = 0.01)
  output <- model(x)
  loss <- nnf_cross_entropy(output, y)
  # Backward pass
  optimizer$zero_grad()
  loss$backward()
  list(
    model$parameters$fc.weight$grad,
    optimizer$param_groups[[1]]$params$fc.weight$grad
  )
}
> f("mps:0")
[[1]]
torch_tensor
0 0
[ MPSFloatType{1,2} ]
[[2]]
torch_tensor
0 0
[ MPSFloatType{1,2} ]
Thanks for looking into this! But tensors have reference semantics, so I think calling model$to(device) afterwards should actually be fine (unless I am misunderstanding something).
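To make this concrete, here is a minimal sketch (not part of the original thread) of the reference semantics being referred to: a plain R assignment copies only the handle to the tensor, not its data, so an in-place update made through one binding is visible through every other binding.

library(torch)

w <- torch_zeros(2)
w_alias <- w    # copies the reference, not the data
w$add_(1)       # in-place update through the first binding
print(w_alias)  # prints 1 1: both names point at the same tensor

Under these semantics, model$to(device) would only be safe for the optimizer if it moved the existing tensors in place rather than replacing them, which is exactly what the next reply tests.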
Yes, exactly! That is why I think the $to() method unintentionally creates a copy instead of moving the tensors in place. Check:
f <- function(device) {
  nn_spiral_net <- nn_module("nn_spiral_net",
    initialize = function() {
      self$fc <- nn_linear(2, 1)
    },
    forward = function(x) {
      self$fc(x)
    }
  )
  # Create model instance
  model <- nn_spiral_net()
  x <- torch_randn(1, 2)$to(device = device)
  y <- torch_tensor(1L)$to(device = device)
  optimizer <- optim_sgd(model$parameters, lr = 0.01)
  model$to(device = device)
  # Compare the device of the model's parameter with the optimizer's copy
  print(model$parameters$fc.weight$device)
  print(optimizer$param_groups[[1]]$params$fc.weight$device)
  output <- model(x)
  loss <- nnf_cross_entropy(output, y)
  # Backward pass
  optimizer$zero_grad()
  loss$backward()
  list(
    model$parameters$fc.weight$grad,
    optimizer$param_groups[[1]]$params$fc.weight$grad
  )
}
torch_device(type='mps', index=0)
torch_device(type='cpu')
The tensors live on different devices, so they really are different objects...
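The same effect can be reproduced without any module at all. Here is a hedged standalone sketch (assuming a machine where the MPS backend is available) showing that $to(device = ...) returns a new tensor on the target device rather than moving the original in place, which would explain why the optimizer ends up holding a stale CPU tensor:

library(torch)

w_cpu <- torch_zeros(2)
w_mps <- w_cpu$to(device = "mps")  # allocates a copy on the MPS device
w_mps$add_(1)                      # modify only the copy, in place
print(w_cpu)                       # still 0 0: the original CPU tensor is untouched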