graphstorm
graphstorm copied to clipboard
[Bug] When using batch norm GraphStorm may fail
GraphStorm's edge sampler does not guarantee that, for each edge type in the training set, it will sample more than one edge. This causes an error when batch norm is turned on, because BatchNorm1d requires more than one sample per channel in training mode.
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2416, in _verify_batch_size
raise ValueError("Expected more than 1 value per channel when training, got input size {}".format(size))
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 128])
Traceback (most recent call last):
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py", line 228, in <module>
main(gs_args)
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py", line 188, in main
trainer.fit(train_loader=dataloader, val_loader=val_dataloader,
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/trainer/lp_trainer.py", line 191, in fit
loss = model(blocks, pos_graph, neg_graph,
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1156, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/parallel/distributed.py", line 1110, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0]) # type: ignore[index]
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/model/lp_gnn.py", line 110, in forward
encode_embs = self.compute_embed_step(blocks, node_feats, input_nodes)
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/model/gnn.py", line 806, in compute_embed_step
gnn_embs = self.gnn_encoder(blocks, embs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/model/rgat_encoder.py", line 313, in forward
h = layer(block, h)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/model/rgat_encoder.py", line 208, in forward
return {ntype : _apply(ntype, h) for ntype, h in hs.items()}
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/model/rgat_encoder.py", line 208, in <dictcomp>
return {ntype : _apply(ntype, h) for ntype, h in hs.items()}
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/model/rgat_encoder.py", line 188, in _apply
h = self.norm[ntype](h)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/modules/batchnorm.py", line 171, in forward
return F.batch_norm(
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2448, in batch_norm
_verify_batch_size(input.size())
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/nn/functional.py", line 2416, in _verify_batch_size
raise ValueError("Expected more than 1 value per channel when training, got input size {}".format(size))
ValueError: Expected more than 1 value per channel when training, got input size torch.Size([1, 128])
Client[15] in group[0] is exiting...
Traceback (most recent call last):
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py", line 228, in <module>
Traceback (most recent call last):
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py", line 228, in <module>
main(gs_args)
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py", line 188, in main
main(gs_args)
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/run/gsgnn_lp/gsgnn_lp.py", line 188, in main
trainer.fit(train_loader=dataloader, val_loader=val_dataloader,
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/trainer/lp_trainer.py", line 199, in fit
trainer.fit(train_loader=dataloader, val_loader=val_dataloader,
File "/mnt/efs/fs1/aa/2024/graphstorm/python/graphstorm/trainer/lp_trainer.py", line 199, in fit
loss.backward()
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/_tensor.py", line 487, in backward
loss.backward()
File "/home/ubuntu/.local/lib/python3.8/site-packages/torch/_tensor.py", line 487, in backward
torch.autograd.backward(torch.autograd.backward(