CorrDiff: Compatibility between the logged wandb loss and loss in training stats
FYI. I accomplished this in my own code like this:
commit 1793b9efb1e20c1995e1c34199aa10f602632e16
Author: Noah D. Brenowitz <[email protected]>
Date: Mon Apr 1 16:12:37 2024 -0700
log collected loss statistics to tensorboard
diff --git a/torch_utils/training_stats.py b/torch_utils/training_stats.py
index 727c4e8..d96b803 100644
--- a/torch_utils/training_stats.py
+++ b/torch_utils/training_stats.py
@@ -14,6 +14,7 @@ import re
 import numpy as np
 import torch
 import dnnlib
+from torch.utils.tensorboard.writer import SummaryWriter
 from . import misc
@@ -143,6 +144,17 @@ class Collector:
         """
         return [name for name in _counters if self._regex.fullmatch(name)]
+    def log_to_tensorboard(self, writer: SummaryWriter):
+        info = self.as_dict()
+        try:
+            nimg = info["Progress/kimg"]["mean"] * 1000
+        except KeyError:
+            nimg = None
+
+        for k, v in info.items():
+            for moment in v:
+                writer.add_scalar(f"{k}/{moment}", v[moment], global_step=nimg)
+
     def update(self):
         r"""Copies current values of the internal counters to the
         user-visible state and resets them for the next round.
diff --git a/training/training_loop.py b/training/training_loop.py
index bcb2f9d..a668d7a 100644
--- a/training/training_loop.py
+++ b/training/training_loop.py
@@ -253,8 +253,6 @@ def training_loop(
         torch.cuda.nvtx.range_push("training_loop:step")
-        if dist.get_rank() == 0:
-            writer.add_scalar("Loss/loss", total_loss, cur_nimg)
         # Update weights.
         for g in optimizer.param_groups:
             g["lr"] = optimizer_kwargs["lr"] * min(
@@ -390,6 +388,7 @@ def training_loop(
         # Update logs.
         training_stats.default_collector.update()
         if dist.get_rank() == 0:
+            training_stats.default_collector.log_to_tensorboard(writer)
             if stats_jsonl is None:
                 stats_jsonl = open(os.path.join(run_dir, "stats.jsonl"), "at")
             stats_jsonl.write(
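For reference, here is a minimal standalone sketch of what this ends up writing to TensorBoard. The stats dict below is only an illustrative stand-in for what Collector.as_dict() returns (per-statistic moments such as num/mean/std), and the log directory and values are placeholders:

from torch.utils.tensorboard.writer import SummaryWriter

# Illustrative stand-in for training_stats.default_collector.as_dict():
# each statistic name maps to its accumulated moments. Values are made up.
stats = {
    "Loss/loss": {"num": 128, "mean": 0.042, "std": 0.011},
    "Progress/kimg": {"num": 1, "mean": 3.2, "std": 0.0},
}

writer = SummaryWriter(log_dir="./runs/example")  # placeholder run directory
nimg = int(stats["Progress/kimg"]["mean"] * 1000)  # step counted in images, as in the patch
for name, moments in stats.items():
    for moment, value in moments.items():
        writer.add_scalar(f"{name}/{moment}", value, global_step=nimg)
writer.close()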
I used TensorBoard as the logging API instead; wandb can be configured to sync TensorBoard logs automatically.
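For completeness, a minimal sketch of that wandb configuration, assuming the standard wandb Python package (the project name here is a placeholder):

import wandb

# sync_tensorboard=True patches the TensorBoard SummaryWriter so every
# add_scalar call is also streamed to the wandb run.
wandb.init(project="corrdiff-training", sync_tensorboard=True)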