ColossalAI
ColossalAI copied to clipboard
[BUG]: Zero returns fp16 tensors which causes RuntimeError
🐛 Describe the bug
I ran the following script and it reported "Found dtype Float but expected Half". It turns out that y_hat is of type fp16, but y and the loss are of type fp32, which causes the error.
Maybe ZeroModel should cast the output back to fp32 instead of leaving it to the user?
import colossalai
import colossalai.nn
import colossalai.utils
import colossalai.zero.init_ctx
import colossalai.zero.shard_utils
import torch
import torch.utils.data
from colossalai.core import global_context as colossal_gpc
IN_DIM, OUT_DIM = 4096, 4096


class MyDS(torch.utils.data.Dataset):
    """Synthetic dataset of random (input, target) pairs.

    Every lookup draws fresh samples with ``torch.randn``; the requested
    index is ignored, so the same index does not return the same data twice.
    """

    def __len__(self) -> int:
        """Return the fixed nominal dataset size."""
        return 10000

    def __getitem__(self, index):
        """Return a random ``(input, target)`` pair.

        The input has ``IN_DIM`` elements and the target has ``OUT_DIM``
        elements. ``index`` is accepted for the Dataset protocol but unused.
        """
        return torch.randn(IN_DIM), torch.randn(OUT_DIM)
class MyModel(torch.nn.Module):
    """Two stacked linear layers mapping IN_DIM -> 32768 -> OUT_DIM."""

    def __init__(self) -> None:
        super().__init__()
        # The two layers are applied back to back with no activation between
        # them; the attribute is named ``linear`` and holds the whole stack.
        self.linear = torch.nn.Sequential(
            torch.nn.Linear(IN_DIM, 32768),
            torch.nn.Linear(32768, OUT_DIM),
        )

    def forward(self, x):
        """Run ``x`` through both linear layers and return the output."""
        return self.linear(x)
def main():
    """Reproduce the fp16/fp32 dtype mismatch under ColossalAI ZeRO.

    Launches ColossalAI with a ZeRO configuration, builds ``MyModel`` inside
    a ``ZeroInitContext``, and runs a training loop with ``MSELoss``. As
    reported in this issue, the engine output ``y_hat`` is fp16 while the
    target ``y`` is fp32, so the criterion call fails with
    "Found dtype Float but expected Half".
    """
    config = {
        "zero": {
            "model_config": {
                "shard_strategy": colossalai.zero.shard_utils.BucketTensorShardStrategy(),
                "reduce_scatter_bucket_size_mb": 25,
                "fp32_reduce_scatter": False,
                "tensor_placement_policy": "cuda",
                "gradient_predivide_factor": 1.0,
                "reuse_fp16_shard": True,
            },
            "optimizer_config": {
                "gpu_margin_mem_ratio": 0.5,
                "initial_scale": 2**5,
                "min_scale": 1,
                "growth_factor": 2,
                "backoff_factor": 0.5,
                "growth_interval": 1000,
                "hysteresis": 2,
                "max_scale": 2**32,
            },
        },
    }
    colossalai.launch_from_torch(config)

    # The shard strategy is read back from the global context so the init
    # context uses the same strategy object that was passed in ``config``.
    ctx = colossalai.zero.init_ctx.ZeroInitContext(
        target_device=colossalai.utils.get_current_device(),
        shard_strategy=colossal_gpc.config.zero.model_config.shard_strategy,
        shard_param=True,
    )
    # NOTE(review): only model construction is assumed to belong inside the
    # context -- confirm against the original script.
    with ctx:
        model = MyModel()

    ds = MyDS()
    loader = colossalai.utils.get_dataloader(ds, batch_size=128)
    optim = colossalai.nn.HybridAdam(model.parameters())
    criterion = torch.nn.MSELoss()
    engine, loader, _, _ = colossalai.initialize(model, optim, criterion, loader)

    engine.train()
    for i, (x, y) in enumerate(loader):
        x, y = x.cuda(), y.cuda()
        y_hat = engine(x)
        # Workaround: casting the output back to fp32 avoids the error.
        # It is left disabled so that the script still reproduces the bug.
        # y_hat = y_hat.float()
        print(f"{y_hat.dtype}, {y.dtype}")
        loss = engine.criterion(y_hat, y)
        print(f"Iteration {i}, loss: {loss.item()}.")
        engine.backward(loss)
        engine.step()
        engine.zero_grad()


if __name__ == "__main__":
    main()
Environment
colossalai version 0.1.5+torch1.11cu10.2
For some loss functions, such as cross-entropy loss, fp16 output is OK. Casting the output back to fp32 may increase memory usage during the backward pass, but it loses less precision. For now you can do this cast yourself; we are still trying to figure out whether casting the output back to fp32 is a good idea.
I understand that fp32 would increase memory footprint, but I don't understand why it would be less precise.
I understand that fp32 would increase memory footprint, but I don't understand why it would be less precise.
By "loss less precision" I meant that it loses less precision, i.e., it is more precise.
We have updated a lot. This issue was closed due to inactivity. Thanks.