issue-tracking
issue-tracking copied to clipboard
python script does not exit process when script is done running
Describe the Bug
I am training a PyTorch model and logging custom experiment metrics. After the code has finished running, the script process does not exit. I have to stop the script manually, and whenever I do, Comet ML prints the following messages:
COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)
COMET INFO: The Python SDK has 3600 seconds to finish before aborting...
COMET WARNING: Failed to log run in comet.ml
However, when I look at my experiments online, everything appears to have been logged successfully. I have tried calling experiment.end(), and the same bug keeps occurring.
Here is the code I am running,
from comet_ml import Experiment
# Module-level Comet experiment handle; it is passed to the trainer and also
# used for parameter logging by the driver script.
experiment = Experiment(
    workspace="mrbraden56",
    project_name="c-sudo-rm-rf",
    api_key=XXXXX,  # NOTE(review): presumably a redacted placeholder -- supply the real key
)
class Model:
    """Trainer that fits a model on a dataset and logs the run to Comet.

    NOTE(review): the indentation of the pasted snippet was lost. The nesting
    used here (print/metric logging once per batch, checkpoint saved and
    uploaded once after the last epoch) is the most plausible reading of the
    original -- confirm against the real file.
    """

    # Checkpoint file written by train() and read back by log_audio().
    MODEL_PATH: str = "/home/braden/Work/Research/RemixIT/c_sudo_rmrf/dnn/saved_models/model.pth"
    # Example recording that log_audio() uploads and runs through the model.
    AUDIO_PATH: str = "/home/braden/Work/Research/RemixIT/MS-SNSD/Final/Training/NoisySpeech/noisy1_SNRdb_10.0_clnsp1.wav"

    def __init__(self, dataset: "MSSNSDDataset", params: dict,
                 model: "torch.nn.Module", loss, experiment: "Experiment"):
        """Store the collaborators and build the data loader and optimizer.

        Args:
            dataset: Dataset wrapped in a ``DataLoader``; each batch is
                unpacked as ``(noisy, clean)`` by ``train``.
            params: Hyper-parameters; the keys ``"batch_size"``, ``"lr"`` and
                ``"epochs"`` are read by this class.
            model: Network to train; must expose ``parameters()``,
                ``zero_grad()``, ``state_dict()`` and ``load_state_dict()``.
            loss: Callable invoked as ``loss(output, clean)`` whose result
                supports ``backward()`` and ``item()``.
            experiment: Comet experiment used for all logging.
        """
        self.dataset: MSSNSDDataset = dataset
        self.params: dict = params
        self.model = model
        self.train_loader: DataLoader = DataLoader(
            dataset=dataset,
            batch_size=params["batch_size"],
            shuffle=False,
            drop_last=False,
        )
        self.optimizer = torch.optim.Adam(model.parameters(), lr=params["lr"])
        self.loss = loss
        self.experiment: Experiment = experiment

    def train(self):
        """Run the training loop, then save and upload the checkpoint.

        Every batch performs one optimizer step, prints the loss and logs it
        to Comet as ``train_batch_loss``. After the final epoch the model's
        ``state_dict`` is written to ``MODEL_PATH`` and logged as a Comet
        model named ``C_Sudo_Rm_Rf++``.
        """
        for epoch in range(self.params["epochs"]):
            for noisy, clean in self.train_loader:
                self.model.zero_grad()
                output = self.model(noisy)
                loss = self.loss(output, clean)
                loss.backward()
                self.optimizer.step()
                print(f"Epoch: {epoch} Loss: {loss.item()}")
                self.experiment.log_metric("train_batch_loss", loss.item())
        torch.save(self.model.state_dict(), self.MODEL_PATH)
        # Use the injected experiment rather than a module-level global so the
        # class works with whichever experiment it was constructed with.
        self.experiment.log_model("C_Sudo_Rm_Rf++", self.MODEL_PATH)

    def normalize_tensor_wav(self, wav_tensor, eps=1e-8, std=None):
        """Return ``wav_tensor`` centred and scaled along its last dimension.

        Args:
            wav_tensor: Tensor to normalise.
            eps: Constant added to the divisor to avoid division by zero.
            std: Optional precomputed divisor; when ``None`` the standard
                deviation over the last dimension of ``wav_tensor`` is used.

        Returns:
            ``(wav_tensor - mean) / (std + eps)`` with ``mean`` taken over the
            last dimension (``keepdim=True``).
        """
        mean = torch.mean(input=wav_tensor, dim=-1, keepdim=True)
        if std is None:
            std = torch.std(input=wav_tensor, dim=-1, keepdim=True)
        return (wav_tensor - mean) / (std + eps)

    def log_audio(self):
        """Upload an example recording and the model's two outputs for it.

        Loads ``AUDIO_PATH`` at 8 kHz, logs it, reloads the checkpoint from
        ``MODEL_PATH``, runs the normalised waveform through the model and
        logs the first two entries of the output as ``Source_1`` and
        ``Source_2``.
        """
        waveform, sr = librosa.load(path=self.AUDIO_PATH, sr=8000)
        self.experiment.log_audio(audio_data=waveform,
                                  sample_rate=sr,
                                  file_name="noisy1_SNRdb_10.0_clnsp1")

        self.model.load_state_dict(torch.load(self.MODEL_PATH))

        # Add two leading singleton dimensions before feeding the model.
        # NOTE(review): presumably (batch, channel, time) -- confirm.
        waveform = torch.tensor(waveform)
        waveform = torch.unsqueeze(waveform, 0)
        waveform = torch.unsqueeze(waveform, 0)
        waveform = self.normalize_tensor_wav(wav_tensor=waveform)

        output = self.model(waveform)
        output = torch.squeeze(output, 0)
        # detach() drops the autograd graph so the tensors convert to NumPy.
        s1, s2 = output[0].detach().numpy(), output[1].detach().numpy()
        self.experiment.log_audio(audio_data=s1,
                                  sample_rate=sr,
                                  file_name="Source_1")
        self.experiment.log_audio(audio_data=s2,
                                  sample_rate=sr,
                                  file_name="Source_2")
@mrbraden56 Would you be able to share the script you're using to actually start the training run? Also, would it be possible to run the script in debug mode? You can do this by setting the following environment variables and rerunning the script:
export COMET_LOGGING_FILE=./comet.log
export COMET_LOGGING_FILE_LEVEL=debug
If you could share the comet.log file with me after the script has finished running, it would be very helpful.
@DN6 Here is where I call the training loop; let me get the comet.log file.
I also set the environment variables and ran the script; however, I do not know where the log file is.
# Driver script: declare hyper-parameters, log them to Comet, build the
# network and trainer, then train and upload example audio.

# Locations of the MS-SNSD training data, keyed by role.
paths: dict[str, str] = {
    "noisy_audio": "/home/braden/Work/Research/RemixIT/MS-SNSD/NoisySpeech_training",
    "clean_audio": "/home/braden/Work/Research/RemixIT/MS-SNSD/CleanSpeech_training",
    "noise": "/home/braden/Work/Research/RemixIT/MS-SNSD/Noise_training",
}

batch_size: int = 4
lr: float = 0.01
step_size: int = 10
gamma: float = .5
epochs: int = 5

# NOTE(review): "lr_scheduler", "step_size" and "gamma" are logged here, but no
# scheduler is created in the code shown in this thread -- confirm they are used.
params = {
    "batch_size": batch_size,
    "lr": lr,
    "optimizer": "Adam",
    "lr_scheduler": "StepLR",
    "step_size": step_size,
    "gamma": gamma,
    "epochs": epochs,
    "loss": "sisdr_loss",
}
experiment.log_parameters(params)

model = CausalSuDORMRF(
    in_audio_channels=1,
    out_channels=256,
    in_channels=512,
    num_blocks=4,
    upsampling_depth=5,
    enc_kernel_size=21,
    enc_num_basis=512,
    num_sources=2,
)

# NOTE(review): the Model.__init__ posted earlier in this thread requires a
# `dataset` argument and has no `paths` parameter, so this call would raise
# TypeError against that definition -- confirm which version is actually run.
trainer = Model(
    params=params,
    model=model,
    loss=sisdr_loss,
    experiment=experiment,
    paths=paths,
)
trainer.train()
trainer.log_audio()
The file should be created in the current working directory and will be called comet.log
This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.
This issue was closed because it has been stalled for 5 days with no activity.