🐛[BUG]: Error with Domain Parallelism (DoMINO)

Open kk98kk opened this issue 1 month ago • 2 comments

Version

DoMINO 25.08 (physicsnemo v1.3.0a0)

On which installation method(s) does this occur?

No response

Describe the issue

I encounter the following error when attempting to use Domain Parallelism with the DoMINO architecture. My setup uses Python 3.11.14 and torch 2.9.0 (CUDA 12.3), running in a conda environment.

I would really appreciate any suggestions or insights on how to resolve this issue.

[2025-11-12 12:23:29,074][ComputeStatistics][INFO] - Scaling factors loaded from: /lustre/calc2/domino_prj/DrivAerML/outputs/volume/volume_test/scaling_factors/scaling_factors.pkl
[2025-11-12 12:23:55,238][Train][INFO] - Config summary:
data:
  bounding_box:
    max:
    - 8.5
    - 2.25
    - 3.0
    min:
    - -3.5
    - -2.25
    - -0.32
  bounding_box_surface:
    max:
    - 5.0
    - 1.4
    - 1.4
    min:
    - -1.5
    - -1.4
    - -0.32
  gpu_output: true
  gpu_preprocessing: true
  input_dir: /lustre/calc2/domino_prj/DrivAerML/data/train
  input_dir_val: /lustre/calc2/domino_prj/DrivAerML/data/val
  max_samples_for_statistics: 200
  normalize_coordinates: true
  sample_in_bbox: true
  sampling: true
  scaling_factors: ${project_dir}/scaling_factors/scaling_factors.pkl
  volume_sample_from_disk: false
data_processor:
  cached_dir: /user/cached/drivaer_aws/drivaer_data_full/
  input_dir: /data/drivaer_aws/drivaer_data_full/
  kind: drivaer_aws
  num_processors: 12
  output_dir: /user/aws_data_all/
  use_cache: false
domain_parallelism:
  domain_size: 2
  shard_grid: true
  shard_points: true
eval:
  checkpoint_name: DoMINO.0.480.pt
  num_points: 1240000
  refine_stl: false
  save_path: ${project_dir}/preds
  scaling_param_path: ${project_dir}/scaling_factors
  test_path: /lustre/calc2/domino_prj/DrivAerML/data/test
exp_tag: files
model:
  activation: gelu
  aggregation_model:
    activation: ${model.activation}
    base_layer: 512
  combine_volume_surface: false
  encode_parameters: false
  geom_points_sample: 300000
  geometry_encoding_type: both
  geometry_local:
    base_layer: 512
    surface_neighbors_in_radius:
    - 32
    - 128
    surface_radii:
    - 0.05
    - 0.25
    volume_neighbors_in_radius:
    - 64
    - 128
    volume_radii:
    - 0.1
    - 0.25
  geometry_rep:
    geo_conv:
      activation: ${model.activation}
      base_neurons: 32
      base_neurons_in: 1
      base_neurons_out: 1
      fourier_features: false
      num_modes: 5
      surface_hops: 1
      surface_neighbors_in_radius:
      - 8
      - 16
      - 128
      surface_radii:
      - 0.01
      - 0.05
      - 1.0
      volume_hops: 1
      volume_neighbors_in_radius:
      - 32
      - 64
      - 128
      - 256
      volume_radii:
      - 0.1
      - 0.5
      - 1.0
      - 2.5
    geo_processor:
      activation: ${model.activation}
      base_filters: 8
      cross_attention: false
      processor_type: conv
      self_attention: false
      surface_sdf_scaling_factor:
      - 0.01
      - 0.02
      - 0.04
      volume_sdf_scaling_factor:
      - 0.04
  integral_loss_scaling_factor: 100
  interp_res:
  - 128
  - 64
  - 64
  local_point_conv:
    activation: ${model.activation}
  loss_function:
    area_weighing_factor: 10000
    loss_type: mse
  model_type: volume
  nn_basis_functions:
    activation: ${model.activation}
    base_layer: 512
    fourier_features: true
    num_modes: 5
  normalization: min_max_scaling
  num_neighbors_surface: 7
  num_neighbors_volume: 10
  parameter_model:
    activation: ${model.activation}
    base_layer: 512
    fourier_features: false
    num_modes: 5
  position_encoder:
    activation: ${model.activation}
    base_neurons: 512
    fourier_features: true
    num_modes: 5
  return_volume_neighbors: false
  solution_calculation_mode: two-loop
  surf_loss_scaling: 5.0
  surface_points_sample: 8192
  surface_sampling_algorithm: area_weighted
  use_sdf_in_basis_func: true
  use_surface_area: true
  use_surface_normals: true
  vol_loss_scaling: 1.0
  volume_points_sample: 8192
output: /lustre/calc2/domino_prj/DrivAerML/outputs/volume/${project.name}/${exp_tag}
project:
  name: volume_test
project_dir: /lustre/calc2/domino_prj/DrivAerML/outputs/volume/${project.name}/
resume_dir: ${output}/models
train:
  add_physics_loss: false
  amp:
    autocast:
      dtype: torch.float16
    clip_grad: true
    enabled: true
    grad_max_norm: 2.0
    scaler:
      _target_: torch.cuda.amp.GradScaler
      enabled: ${..enabled}
  checkpoint_dir: /user/models/
  checkpoint_interval: 1
  dataloader:
    batch_size: 1
    pin_memory: true
    preload_depth: 1
  epochs: 500
  lr_scheduler:
    T_max: ${train.epochs}
    eta_min: 1.0e-06
    gamma: 0.5
    milestones:
    - 50
    - 200
    - 400
    - 500
    - 600
    - 700
    - 800
    - 900
    name: MultiStepLR
  optimizer:
    lr: 0.001
    name: Adam
    weight_decay: 0.0
  sampler:
    drop_last: false
    shuffle: true
val:
  dataloader:
    batch_size: 1
    pin_memory: true
    preload_depth: 1
  sampler:
    drop_last: false
    shuffle: true
variables:
  global_parameters:
    air_density:
      reference: 1.0
      type: scalar
    inlet_velocity:
      reference:
      - 38.889
      type: vector
  surface:
    solution:
      pMeanTrim: scalar
      wallShearStressMeanTrim: vector
  volume:
    solution:
      UMeanTrim: vector
      nutMeanTrim: scalar
      pMeanTrim: scalar

[2025-11-12 12:23:55,365][Train][INFO] - Model summary:
======================================================================
Layer (type:depth-idx)                        Param #
======================================================================
DoMINO                                        --
├─GeometryRep: 1-1                            --
│    └─GELU: 2-1                              --
│    └─ModuleList: 2-2                        --
│    └─ModuleList: 2-3                        266,032
│    └─ModuleList: 2-4                        48,388
│    └─ModuleList: 2-5                        112
│    └─Sequential: 2-6                        71,783
│    └─Conv3d: 2-7                            28
├─GeometryRep: 1-2                            --
│    └─GELU: 2-8                              --
│    └─ModuleList: 2-9                        --
│    └─ModuleList: 2-10                       199,524
│    └─ModuleList: 2-11                       16,323
│    └─ModuleList: 2-12                       84
│    └─Sequential: 2-13                       74,649
│    └─Conv3d: 2-14                           28
├─ModuleList: 1-3                             --
│    └─FourierMLP: 2-15                       542,720
│    └─FourierMLP: 2-16                       542,720
│    └─FourierMLP: 2-17                       542,720
│    └─FourierMLP: 2-18                       542,720
│    └─FourierMLP: 2-19                       542,720
├─GELU: 1-4                                   --
├─FourierMLP: 1-5                             --
│    └─Mlp: 2-20                              570,880
├─MultiGeometryEncoding: 1-6                  --
│    └─ModuleList: 2-21                       410,784
├─MultiGeometryEncoding: 1-7                  --
│    └─ModuleList: 2-22                       591,040
├─ModuleList: 1-8                             --
│    └─AggregationModel: 2-23                 1,411,585
│    └─AggregationModel: 2-24                 1,411,585
│    └─AggregationModel: 2-25                 1,411,585
│    └─AggregationModel: 2-26                 1,411,585
│    └─AggregationModel: 2-27                 1,411,585
├─SolutionCalculatorVolume: 1-9               9,771,525
│    └─ModuleList: 2-28                       (recursive)
│    └─ModuleList: 2-29                       (recursive)
======================================================================
Total params: 21,792,705
Trainable params: 21,792,705
Non-trainable params: 0
======================================================================

[2025-11-12 12:23:55,728][checkpoint][ERROR] - Could not find valid model file /lustre/calc2/domino_prj/DrivAerML/outputs/volume/volume_test/files/models/FSDPDoMINO.0.0.pt, skipping load
[2025-11-12 12:23:55,728][checkpoint][ERROR] - Could not find valid model file /lustre/calc2/domino_prj/DrivAerML/outputs/volume/volume_test/files/models/FSDPDoMINO.0.0.pt, skipping load
[2025-11-12 12:23:55,729][checkpoint][WARNING] - Could not find valid checkpoint file, skipping load
[2025-11-12 12:23:55,729][checkpoint][WARNING] - Could not find valid checkpoint file, skipping load
[2025-11-12 12:23:55,729][Train][INFO] - Device cuda:0, epoch 0:
Error executing job with overrides: []
Error executing job with overrides: []
[rank0]: Traceback (most recent call last):
[rank0]:   File "/lustre/calc2/domino_prj/domino_copy/src/train.py", line 714, in <module>
[rank0]:     main()
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/main.py", line 94, in decorated_main
[rank0]:     _run_hydra(
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank0]:     _run_app(
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank0]:     run_and_report(
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank0]:     raise ex
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank0]:     return func()
[rank0]:            ^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
[rank0]:     lambda: hydra.run(
[rank0]:             ^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank0]:     _ = ret.return_value
[rank0]:         ^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/core/utils.py", line 260, in return_value
[rank0]:     raise self._return_value
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/core/utils.py", line 186, in run_job
[rank0]:     ret.return_value = task_function(task_cfg)
[rank0]:                        ^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/domino_prj/domino_copy/src/train.py", line 618, in main
[rank0]:     avg_loss = train_epoch(
[rank0]:                ^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/domino_prj/domino_copy/src/train.py", line 223, in train_epoch
[rank0]:     prediction_vol, prediction_surf = model(sampled_batched)
[rank0]:                                       ^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
[rank0]:     return inner()
[rank0]:            ^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1829, in inner
[rank0]:     result = forward_call(*args, **kwargs)
[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/models/domino/model.py", line 508, in forward
[rank0]:     encoding_g_vol = self.geo_rep_volume(geo_centers_vol, p_grid, sdf_grid)
[rank0]:                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/models/domino/geometry_rep.py", line 456, in forward
[rank0]:     mapping, k_short = self.bq_warp[j](x, p_grid)
[rank0]:                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank0]:     return self._call_impl(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
[rank0]:     return forward_call(*args, **kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/models/layers/ball_query.py", line 95, in forward
[rank0]:     p_grid = rearrange(p_grid, "b nx ny nz c -> b (nx ny nz) c")
[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/einops.py", line 600, in rearrange
[rank0]:     return reduce(tensor, pattern, reduction="rearrange", **axes_lengths)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/einops.py", line 532, in reduce
[rank0]:     return _apply_recipe(
[rank0]:            ^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/einops.py", line 251, in _apply_recipe
[rank0]:     tensor = backend.reshape(tensor, final_shapes)
[rank0]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/_backends.py", line 93, in reshape
[rank0]:     return x.reshape(shape)
[rank0]:            ^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/distributed/shard_tensor.py", line 403, in __torch_function__
[rank0]:     return super().__torch_function__(func, types, args, kwargs)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/distributed/shard_tensor.py", line 433, in __torch_dispatch__
[rank0]:     dispatch_res = DTensor._op_dispatcher.dispatch(func, args, kwargs or {})
[rank0]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/distributed/tensor/_dispatch.py", line 329, in dispatch
[rank0]:     return return_and_correct_aliasing(op_call, args, kwargs, ret)
[rank0]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/utils/_python_dispatch.py", line 729, in return_and_correct_aliasing
[rank0]:     _correct_storage_aliasing(
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/utils/_python_dispatch.py", line 579, in _correct_storage_aliasing
[rank0]:     alias_non_inplace_storage(args[arg_idx], outs[return_idx])
[rank0]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/utils/_python_dispatch.py", line 551, in alias_non_inplace_storage
[rank0]:     assert type(arg) == type(
[rank0]:            ^^^^^^^^^^^^^^^^^^
[rank0]: AssertionError: Called aten.view.default with input of type <class 'physicsnemo.distributed.shard_tensor.ShardTensor'>
[rank0]: and output of type <class 'torch.distributed.tensor.DTensor'>. But expected types to match.
[rank1]: Traceback (most recent call last):
[rank1]:   File "/lustre/calc2/domino_prj/domino_copy/src/train.py", line 714, in <module>
[rank1]:     main()
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/main.py", line 94, in decorated_main
[rank1]:     _run_hydra(
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 394, in _run_hydra
[rank1]:     _run_app(
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 457, in _run_app
[rank1]:     run_and_report(
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 223, in run_and_report
[rank1]:     raise ex
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 220, in run_and_report
[rank1]:     return func()
[rank1]:            ^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/utils.py", line 458, in <lambda>
[rank1]:     lambda: hydra.run(
[rank1]:             ^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/_internal/hydra.py", line 132, in run
[rank1]:     _ = ret.return_value
[rank1]:         ^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/core/utils.py", line 260, in return_value
[rank1]:     raise self._return_value
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/hydra/core/utils.py", line 186, in run_job
[rank1]:     ret.return_value = task_function(task_cfg)
[rank1]:                        ^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/domino_prj/domino_copy/src/train.py", line 618, in main
[rank1]:     avg_loss = train_epoch(
[rank1]:                ^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/domino_prj/domino_copy/src/train.py", line 223, in train_epoch
[rank1]:     prediction_vol, prediction_surf = model(sampled_batched)
[rank1]:                                       ^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1881, in _call_impl
[rank1]:     return inner()
[rank1]:            ^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1829, in inner
[rank1]:     result = forward_call(*args, **kwargs)
[rank1]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/models/domino/model.py", line 508, in forward
[rank1]:     encoding_g_vol = self.geo_rep_volume(geo_centers_vol, p_grid, sdf_grid)
[rank1]:                      ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
[rank1]:     return forward_call(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/models/domino/geometry_rep.py", line 456, in forward
[rank1]:     mapping, k_short = self.bq_warp[j](x, p_grid)
[rank1]:                        ^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
[rank1]:     return self._call_impl(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
[rank1]:     return forward_call(*args, **kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/models/layers/ball_query.py", line 95, in forward
[rank1]:     p_grid = rearrange(p_grid, "b nx ny nz c -> b (nx ny nz) c")
[rank1]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/einops.py", line 600, in rearrange
[rank1]:     return reduce(tensor, pattern, reduction="rearrange", **axes_lengths)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/einops.py", line 532, in reduce
[rank1]:     return _apply_recipe(
[rank1]:            ^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/einops.py", line 251, in _apply_recipe
[rank1]:     tensor = backend.reshape(tensor, final_shapes)
[rank1]:              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/einops/_backends.py", line 93, in reshape
[rank1]:     return x.reshape(shape)
[rank1]:            ^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/distributed/shard_tensor.py", line 403, in __torch_function__
[rank1]:     return super().__torch_function__(func, types, args, kwargs)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/gitclone/physicsnemo_28_10/physicsnemo/distributed/shard_tensor.py", line 433, in __torch_dispatch__
[rank1]:     dispatch_res = DTensor._op_dispatcher.dispatch(func, args, kwargs or {})
[rank1]:                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/distributed/tensor/_dispatch.py", line 329, in dispatch
[rank1]:     return return_and_correct_aliasing(op_call, args, kwargs, ret)
[rank1]:            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/utils/_python_dispatch.py", line 729, in return_and_correct_aliasing
[rank1]:     _correct_storage_aliasing(
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/utils/_python_dispatch.py", line 579, in _correct_storage_aliasing
[rank1]:     alias_non_inplace_storage(args[arg_idx], outs[return_idx])
[rank1]:   File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/utils/_python_dispatch.py", line 551, in alias_non_inplace_storage
[rank1]:     assert type(arg) == type(
[rank1]:            ^^^^^^^^^^^^^^^^^^
[rank1]: AssertionError: Called aten.view.default with input of type <class 'physicsnemo.distributed.shard_tensor.ShardTensor'>
[rank1]: and output of type <class 'torch.distributed.tensor.DTensor'>. But expected types to match.
W1112 12:24:07.112000 23548 site-packages/torch/distributed/elastic/multiprocessing/api.py:908] Sending process 23581 closing signal SIGTERM
E1112 12:24:07.476000 23548 site-packages/torch/distributed/elastic/multiprocessing/api.py:882] failed (exitcode: 1) local_rank: 0 (pid: 23580) of binary: /lustre/calc2/miniforge3/envs/domino2/bin/python3.11
Traceback (most recent call last):
  File "/lustre/calc2/miniforge3/envs/domino2/bin/torchrun", line 7, in <module>
    sys.exit(main())
             ^^^^^^
  File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 357, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/distributed/run.py", line 936, in main
    run(args)
  File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/distributed/run.py", line 927, in run
    elastic_launch(
  File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 156, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/lustre/calc2/miniforge3/envs/domino2/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 293, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
============================================================
/lustre/calc2/domino_prj/domino_copy/src/train.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2025-11-12_12:24:07
  host      : ##############
  rank      : 0 (local_rank: 0)
  exitcode  : 1 (pid: 23580)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

Minimum reproducible example

Relevant log output

Environment details

Nov 12 '25 11:11 kk98kk

Hi @kk98kk - I have never seen this error before! It's unusual. ShardTensor, for domain parallelism, is built on DTensor and uses a lot of its machinery; however, it has hooks to prevent this kind of type mismatch. I am surprised it got around them. Looks like we might be able to reproduce it with just this line:

p_grid = rearrange(p_grid, "b nx ny nz c -> b (nx ny nz) c")

Could you give me a little more information? I'd like to know how you launched the training (number of GPUs, domain-parallel size, data-parallel size, extra arguments to the model if there are any) and about the physicsnemo installation - are you installing from pip, using a container, or maybe installing from source?

Finally, DoMINO gives the option to shard the input points, the latent space grid, or both. Can you share that configuration too? This will help me reproduce it.

Nov 13 '25 15:11 coreyjadams

Hi @coreyjadams, for debugging and testing the domain parallelism feature I’m currently using only 2 GPUs. The relevant excerpt from my config.yaml looks like this:

# ┌───────────────────────────────────────────┐
# │          Domain Parallelism Settings      │
# └───────────────────────────────────────────┘  
domain_parallelism:
  domain_size: 2
  shard_grid: true
  shard_points: true

Current setup:

number of GPUs = 2
domain-parallel size = 2
data-parallel size = 0/1 (all batches run on the same pair of GPUs; only the amount of data is split, not the number of batches)
no additional model arguments

and I’m running the training with the following command in my conda environment:

torchrun --standalone --nproc_per_node=2 train.py

Goal: To eventually run this on 4 GPUs:

2 GPUs forming one domain-parallel group (to handle larger grids/data)
and two such groups, to effectively halve the batch size / reduce computation time (data parallelism)

Installation: I installed physicsnemo directly from the GitHub repository inside a Conda environment (pip install .). Pure data parallelism works without any issues - splitting batches across 1, 2, or 4 GPUs works fine.

Behavior of the error: The error appears regardless of whether I shard

only the latent space grid,
only the input point grid, or
both.

The error pattern is always the same.

Note on minimum Torch requirements: You may want to update your minimum Torch version requirements. Based on my tests:

Torch < 2.5 → package/module names have changed
Torch 2.6 / 2.7 → no error, but domain parallelism does not work
Torch 2.8 / 2.9 → errors occur, but in different locations
Torch 2.9 → the strictest regarding data types

Below is my full config.yaml:

# SPDX-FileCopyrightText: Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES.
# SPDX-FileCopyrightText: All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# ┌───────────────────────────────────────────┐
# │            Project Details                │
# └───────────────────────────────────────────┘  
project: # Project name
  name: DrivAerML_Dataset
  
exp_tag: 1 # Experiment tag
# Main output directory.
project_dir: outputs/${project.name}/
output: outputs/${project.name}/${exp_tag}

hydra: # Hydra config
  run:
    dir: ${output}
  output_subdir: hydra  # Default is .hydra, which prevents the files from being uploaded to W&B.

# The directory to search for checkpoints to continue training.
resume_dir: ${output}/models

# ┌───────────────────────────────────────────┐
# │            Data Preprocessing             │
# └───────────────────────────────────────────┘  
data_processor: # Data processor configurable parameters
  kind: drivaer_aws # must be either drivesim or drivaer_aws
  output_dir: /user/aws_data_all/
  input_dir: /data/drivaer_aws/drivaer_data_full/
  cached_dir: /user/cached/drivaer_aws/drivaer_data_full/
  use_cache: false
  num_processors: 12

# ┌───────────────────────────────────────────┐
# │            Solution variables             │
# └───────────────────────────────────────────┘  
variables:
  surface:
    solution:
      # The following is for AWS DrivAer dataset.
      pMeanTrim: scalar
      wallShearStressMeanTrim: vector
  volume:
    solution:
      # The following is for AWS DrivAer dataset.
      UMeanTrim: vector
      pMeanTrim: scalar
      nutMeanTrim: scalar
  global_parameters:
    inlet_velocity:
      type: vector
      reference: [38.889] # vector [30, 0, 0] should be specified as [30], while [30, 30, 0] should be [30, 30].
    air_density:
      type: scalar
      reference: 1.0

# ┌───────────────────────────────────────────┐
# │         Data Configs                      │
# └───────────────────────────────────────────┘  
data: # Input directory for training and validation data
  input_dir: /user/data/aws_data_all/
  input_dir_val: /user/data/aws_data_all_val/
  bounding_box: # Bounding box dimensions for computational domain
    min: [-3.5, -2.25, -0.32]
    max: [8.5, 2.25, 3.00]
  bounding_box_surface: # Bounding box dimensions for car surface
    min: [-1.5, -1.4, -0.32]
    max: [5.0, 1.4, 1.4]
  gpu_preprocessing: true
  gpu_output: true
  normalize_coordinates: true
  sample_in_bbox: true
  sampling: true
  scaling_factors: ${project_dir}/scaling_factors/scaling_factors.pkl
  volume_sample_from_disk: true
  max_samples_for_statistics: 200

# ┌───────────────────────────────────────────┐
# │          Domain Parallelism Settings      │
# └───────────────────────────────────────────┘  
domain_parallelism:
  domain_size: 2
  shard_grid: true
  shard_points: true

# ┌───────────────────────────────────────────┐
# │          Model Parameters                 │
# └───────────────────────────────────────────┘  
model:
  model_type: volume # Which model to train: surface, volume, or combined
  activation: "gelu" # "relu" or "gelu"
  loss_function: # Loss settings
    loss_type: "mse" # mse or rmse
    area_weighing_factor: 10000 # Generally inverse of maximum area
  interp_res: [128, 64, 64] # Resolution of latent space (another option noted here: 128, 64, 48)
  use_sdf_in_basis_func: true # SDF in basis function network
  volume_points_sample: 8192 # Number of points to sample in volume per epoch
  surface_points_sample: 8192 # Number of points to sample on surface per epoch
  surface_sampling_algorithm: area_weighted # random or area_weighted
  geom_points_sample: 300_000 # Number of points to sample on STL per epoch
  num_neighbors_surface: 7 # How many neighbors on surface?
  num_neighbors_volume: 10 # How many neighbors on volume?
  combine_volume_surface: false # Combine volume and surface encodings
  return_volume_neighbors: false # Whether to return volume neighbors or not
  use_surface_normals: true # Use surface normals and surface areas for surface computation?
  use_surface_area: true # Also use surface area (false presumably means surface normals only — confirm)
  integral_loss_scaling_factor: 100 # Scale integral loss by this factor
  normalization: min_max_scaling # or mean_std_scaling
  encode_parameters: false # Encode inlet velocity and air density in the model
  surf_loss_scaling: 5.0 # Scale surface loss with this factor in combined mode
  vol_loss_scaling: 1.0 # Scale volume loss with this factor in combined mode
  geometry_encoding_type: both # Geometry encoder type: sdf, stl, or both
  # one-loop is better for sharded, two-loop is lower memory but more overhead.
  # Physics losses are not supported via one-loop presently.
  # NOTE(review): domain_parallelism in this file enables sharding, so per the
  # note above one-loop may be the better fit — confirm before changing.
  solution_calculation_mode: two-loop
  geometry_rep: # Hyperparameters for geometry representation network
    geo_conv:
      base_neurons: 32 # Alternatives noted here: 256 or 64
      base_neurons_in: 1
      base_neurons_out: 1
      volume_radii: [0.1, 0.5, 1.0, 2.5] # Radii for volume
      surface_radii: [0.01, 0.05, 1.0] # Radii for surface
      surface_hops: 1 # Number of surface iterations
      volume_hops: 1 # Number of volume iterations
      # The neighbor lists below have the same length as the matching radii
      # lists — presumably one entry per radius; confirm.
      volume_neighbors_in_radius: [32, 64, 128, 256] # Number of neighbors in radius for volume
      surface_neighbors_in_radius: [8, 16, 128] # Number of neighbors in radius for surface
      fourier_features: false
      num_modes: 5
      activation: ${model.activation}
    geo_processor:
      base_filters: 8
      activation: ${model.activation}
      processor_type: conv # conv or unet (conv is better; fno, fignet to be added)
      self_attention: false # Can be used only with unet
      cross_attention: false # Can be used only with unet
      surface_sdf_scaling_factor: [0.01, 0.02, 0.04] # Scaling factor for SDF, smaller is more emphasis on surface
      volume_sdf_scaling_factor: [0.04] # Scaling factor for SDF, smaller is more emphasis on surface
  nn_basis_functions: # Hyperparameters for basis function network
    base_layer: 512
    fourier_features: true
    num_modes: 5
    activation: ${model.activation}
  local_point_conv:
    activation: ${model.activation}
  aggregation_model: # Hyperparameters for aggregation network
    base_layer: 512
    activation: ${model.activation}
  position_encoder: # Hyperparameters for position encoding network
    base_neurons: 512
    activation: ${model.activation}
    fourier_features: true
    num_modes: 5
  geometry_local: # Hyperparameters for local geometry extraction
    volume_neighbors_in_radius: [64, 128] # Number of neighbors in radius for volume
    surface_neighbors_in_radius: [32, 128] # Number of neighbors in radius for surface
    volume_radii: [0.1, 0.25] # Volume radii
    surface_radii: [0.05, 0.25] # Surface radii
    base_layer: 512
  parameter_model:
    base_layer: 512
    fourier_features: false
    num_modes: 5
    activation: ${model.activation}

# ┌───────────────────────────────────────────┐
# │          Training Configs                 │
# └───────────────────────────────────────────┘  
train: # Training configurable parameters
  epochs: 500
  checkpoint_interval: 1
  dataloader:
    batch_size: 1
    preload_depth: 1
    # Set this to false if the preprocessing is outputting GPU data.
    # NOTE(review): data.gpu_output is true in this config, so by the rule
    # above this should probably be false — confirm against the datapipe
    # before changing it.
    pin_memory: true
  sampler:
    shuffle: true
    drop_last: false
  checkpoint_dir: /user/models/ # Use only for retraining
  add_physics_loss: false
  lr_scheduler:
    name: MultiStepLR # Also supports CosineAnnealingLR
    # Only used if lr_scheduler is MultiStepLR.
    # NOTE(review): milestones at or beyond `epochs` (500) presumably never
    # take effect in a run of this length — confirm they are intentional.
    milestones: [50, 200, 400, 500, 600, 700, 800, 900]
    gamma: 0.5 # Only used if lr_scheduler is MultiStepLR
    T_max: ${train.epochs} # Only used if lr_scheduler is CosineAnnealingLR
    # Only used if lr_scheduler is CosineAnnealingLR. Written with a decimal
    # point so that YAML 1.1 loaders also read it as a float, not a string.
    eta_min: 1.0e-6
  optimizer:
    name: Adam # or AdamW
    lr: 0.001
    weight_decay: 0.0
  amp:
    enabled: true
    autocast:
      dtype: torch.float16
    scaler:
      # NOTE(review): recent PyTorch releases deprecate torch.cuda.amp.GradScaler
      # in favour of torch.amp.GradScaler — confirm before switching the target.
      _target_: torch.cuda.amp.GradScaler
      enabled: ${..enabled} # Relative interpolation: resolves to train.amp.enabled
    clip_grad: true
    grad_max_norm: 2.0


# ┌───────────────────────────────────────────┐
# │          Validation Configs               │
# └───────────────────────────────────────────┘  
val: # Validation configurable parameters
  dataloader:
    batch_size: 1
    preload_depth: 1
    pin_memory: true # Set this to false if the preprocessing is outputting GPU data
  sampler:
    shuffle: true
    drop_last: false

# ┌───────────────────────────────────────────┐
# │          Testing data Configs             │
# └───────────────────────────────────────────┘  
eval: # Testing configurable parameters
  test_path: /user/testing_data # Dir for testing data in raw format (vtp, vtu, stl)
  save_path: /user/predicted_data # Dir to save predicted results in raw format (vtp, vtu)
  checkpoint_name: DoMINO.0.455.pt # Name of checkpoint to select from saved checkpoints
  scaling_param_path: /user/scaling_params # NOTE(review): presumably the dir holding the scaling parameters — confirm
  refine_stl: false # Automatically refine STL during inference
  # TODO: stencil_size was hardcoded anyway; remove this commented-out key.
  # stencil_size: 7 # Stencil size for evaluating surface and volume model
  num_points: 1_240_000 # Number of points to sample on surface and volume per batch

Nov 17 '25 11:11 kk98kk