SLEAP training not proceeding
Bug description
Hello, I've been attempting to run a new project with SLEAP and I'm running into an issue where the training is failing to start. I receive this error:
I've attempted a fresh install to no avail.
It was working earlier when I started my initial training using only 50 labeled frames. However, once I added an additional 300 frames, it no longer runs. Below is the full command-line terminal output.
I've also attempted changing the input scaling. I'm not sure exactly what I've done wrong here, as I've previously run different training sets with many MORE videos of the same size and with many more labels. Any assistance would be appreciated! Thanks!
C:\Windows\System32>conda activate sleap
(sleap) C:\Windows\System32>sleap-label Saving config: C:\Users\Cscho/.sleap/1.4.1a2/preferences.yaml Restoring GUI state...
Software versions: SLEAP: 1.4.1a2 TensorFlow: 2.7.0 Numpy: 1.21.6 Python: 3.7.12 OS: Windows-10-10.0.26100-SP0
Happy SLEAPing! :)
Resetting monitor window.
Polling: G:/Inscopix_projects/Sleap_\models\250429_230757.centroid.n=349\viz\validation.*.png
Start training centroid...
['sleap-train', 'C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json', 'G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp', '--zmq', '--controller_port', '9000', '--publish_port', '9001', '--save_viz']
INFO:sleap.nn.training:Versions:
SLEAP: 1.4.1a2
TensorFlow: 2.7.0
Numpy: 1.21.6
Python: 3.7.12
OS: Windows-10-10.0.26100-SP0
INFO:sleap.nn.training:Training labels file: G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp
INFO:sleap.nn.training:Training profile: C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json
INFO:sleap.nn.training:
INFO:sleap.nn.training:Arguments:
INFO:sleap.nn.training:{
"training_job_path": "C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json",
"labels_path": "G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp",
"video_paths": [
""
],
"val_labels": null,
"test_labels": null,
"base_checkpoint": null,
"tensorboard": false,
"save_viz": true,
"zmq": true,
"publish_port": 9001,
"controller_port": 9000,
"run_name": "",
"prefix": "",
"suffix": "",
"cpu": false,
"first_gpu": false,
"last_gpu": false,
"gpu": "auto"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Training job:
INFO:sleap.nn.training:{
"data": {
"labels": {
"training_labels": null,
"validation_labels": null,
"validation_fraction": 0.1,
"test_labels": null,
"split_by_inds": false,
"training_inds": null,
"validation_inds": null,
"test_inds": null,
"search_path_hints": [],
"skeletons": []
},
"preprocessing": {
"ensure_rgb": false,
"ensure_grayscale": false,
"imagenet_mode": null,
"input_scaling": 0.25,
"pad_to_stride": null,
"resize_and_pad_to_target": true,
"target_height": null,
"target_width": null
},
"instance_cropping": {
"center_on_part": "Center",
"crop_size": null,
"crop_size_detection_padding": 16
}
},
"model": {
"backbone": {
"leap": null,
"unet": {
"stem_stride": null,
"max_stride": 16,
"output_stride": 2,
"filters": 16,
"filters_rate": 2.0,
"middle_block": true,
"up_interpolate": true,
"stacks": 1
},
"hourglass": null,
"resnet": null,
"pretrained_encoder": null
},
"heads": {
"single_instance": null,
"centroid": {
"anchor_part": "Center",
"sigma": 2.5,
"output_stride": 2,
"loss_weight": 1.0,
"offset_refinement": false
},
"centered_instance": null,
"multi_instance": null,
"multi_class_bottomup": null,
"multi_class_topdown": null
},
"base_checkpoint": null
},
"optimization": {
"preload_data": true,
"augmentation_config": {
"rotate": true,
"rotation_min_angle": -180.0,
"rotation_max_angle": 180.0,
"translate": false,
"translate_min": -5,
"translate_max": 5,
"scale": false,
"scale_min": 0.9,
"scale_max": 1.1,
"uniform_noise": false,
"uniform_noise_min_val": 0.0,
"uniform_noise_max_val": 10.0,
"gaussian_noise": false,
"gaussian_noise_mean": 5.0,
"gaussian_noise_stddev": 1.0,
"contrast": false,
"contrast_min_gamma": 0.5,
"contrast_max_gamma": 2.0,
"brightness": false,
"brightness_min_val": 0.0,
"brightness_max_val": 10.0,
"random_crop": false,
"random_crop_height": 256,
"random_crop_width": 256,
"random_flip": true,
"flip_horizontal": false
},
"online_shuffling": true,
"shuffle_buffer_size": 128,
"prefetch": true,
"batch_size": 4,
"batches_per_epoch": null,
"min_batches_per_epoch": 200,
"val_batches_per_epoch": null,
"min_val_batches_per_epoch": 10,
"epochs": 200,
"optimizer": "adam",
"initial_learning_rate": 0.0001,
"learning_rate_schedule": {
"reduce_on_plateau": true,
"reduction_factor": 0.5,
"plateau_min_delta": 1e-06,
"plateau_patience": 5,
"plateau_cooldown": 3,
"min_learning_rate": 1e-08
},
"hard_keypoint_mining": {
"online_mining": false,
"hard_to_easy_ratio": 2.0,
"min_hard_keypoints": 2,
"max_hard_keypoints": null,
"loss_scale": 5.0
},
"early_stopping": {
"stop_training_on_plateau": true,
"plateau_min_delta": 1e-08,
"plateau_patience": 20
}
},
"outputs": {
"save_outputs": true,
"run_name": "250429_230757.centroid.n=349",
"run_name_prefix": "",
"run_name_suffix": "",
"runs_folder": "G:/Inscopix_projects/Sleap_\models",
"tags": [
""
],
"save_visualizations": true,
"delete_viz_images": true,
"zip_outputs": false,
"log_to_csv": true,
"checkpointing": {
"initial_model": false,
"best_model": true,
"every_epoch": false,
"latest_model": false,
"final_model": false
},
"tensorboard": {
"write_logs": false,
"loss_frequency": "epoch",
"architecture_graph": false,
"profile_graph": false,
"visualizations": true
},
"zmq": {
"subscribe_to_controller": true,
"controller_address": "tcp://127.0.0.1:9000",
"controller_polling_timeout": 10,
"publish_updates": true,
"publish_address": "tcp://127.0.0.1:9001"
}
},
"name": "",
"description": "",
"sleap_version": "1.4.1a2",
"filename": "C:\Users\Cscho\AppData\Local\Temp\tmpx26fwmq8\250429_230757_training_job.json"
}
INFO:sleap.nn.training:
INFO:sleap.nn.training:Auto-selected GPU 0 with 6719 MiB of free memory.
INFO:sleap.nn.training:Using GPU 0 for acceleration.
INFO:sleap.nn.training:Disabled GPU memory pre-allocation.
INFO:sleap.nn.training:System:
GPUs: 1/1 available
Device: /physical_device:GPU:0
Available: True
Initialized: False
Memory growth: True
INFO:sleap.nn.training:
INFO:sleap.nn.training:Initializing trainer...
INFO:sleap.nn.training:Loading training labels from: G:/Inscopix_projects/Sleap_/Clean_cerebellum_training.slp
INFO:sleap.nn.training:Creating training and validation splits from validation fraction: 0.1
INFO:sleap.nn.training: Splits: Training = 314 / Validation = 35.
INFO:sleap.nn.training:Setting up for training...
INFO:sleap.nn.training:Setting up pipeline builders...
INFO:sleap.nn.training:Setting up model...
INFO:sleap.nn.training:Building test pipeline...
2025-04-29 23:08:05.342239: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations: AVX AVX2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-04-29 23:08:05.780882: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5446 MB memory: -> device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6
INFO:sleap.nn.training:Loaded test example. [2.909s]
INFO:sleap.nn.training: Input shape: (272, 480, 3)
INFO:sleap.nn.training:Created Keras model.
INFO:sleap.nn.training: Backbone: UNet(stacks=1, filters=16, filters_rate=2.0, kernel_size=3, stem_kernel_size=7, convs_per_block=2, stem_blocks=0, down_blocks=4, middle_block=True, up_blocks=3, up_interpolate=True, block_contraction=False)
INFO:sleap.nn.training: Max stride: 16
INFO:sleap.nn.training: Parameters: 1,953,393
INFO:sleap.nn.training: Heads:
INFO:sleap.nn.training: [0] = CentroidConfmapsHead(anchor_part='Center', sigma=2.5, output_stride=2, loss_weight=1.0)
INFO:sleap.nn.training: Outputs:
INFO:sleap.nn.training: [0] = KerasTensor(type_spec=TensorSpec(shape=(None, 136, 240, 1), dtype=tf.float32, name=None), name='CentroidConfmapsHead/BiasAdd:0', description="created by layer 'CentroidConfmapsHead'")
INFO:sleap.nn.training:Training from scratch
INFO:sleap.nn.training:Setting up data pipelines...
INFO:sleap.nn.training:Training set: n = 314
INFO:sleap.nn.training:Validation set: n = 35
INFO:sleap.nn.training:Setting up optimization...
INFO:sleap.nn.training: Learning rate schedule: LearningRateScheduleConfig(reduce_on_plateau=True, reduction_factor=0.5, plateau_min_delta=1e-06, plateau_patience=5, plateau_cooldown=3, min_learning_rate=1e-08)
INFO:sleap.nn.training: Early stopping: EarlyStoppingConfig(stop_training_on_plateau=True, plateau_min_delta=1e-08, plateau_patience=20)
INFO:sleap.nn.training:Setting up outputs...
INFO:sleap.nn.callbacks:Training controller subscribed to: tcp://127.0.0.1:9000 (topic: )
INFO:sleap.nn.training: ZMQ controller subcribed to: tcp://127.0.0.1:9000
INFO:sleap.nn.callbacks:Progress reporter publishing on: tcp://127.0.0.1:9001 for: not_set
INFO:sleap.nn.training: ZMQ progress reporter publish on: tcp://127.0.0.1:9001
INFO:sleap.nn.training:Created run path: G:/Inscopix_projects/Sleap_\models\250429_230757.centroid.n=349
INFO:sleap.nn.training:Setting up visualization...
INFO:sleap.nn.training:Finished trainer set up. [7.0s]
INFO:sleap.nn.training:Creating tf.data.Datasets for training data generation...
Traceback (most recent call last):
File "C:\Users\Cscho\miniconda3\envs\sleap\Scripts\sleap-train-script.py", line 33, in dataset.cache().take(k).repeat(). You should use dataset.take(k).cache().repeat() instead.
INFO:sleap.nn.callbacks:Closing the reporter controller/context.
INFO:sleap.nn.callbacks:Closing the training controller socket/context.
Run Path: G:/Inscopix_projects/Sleap\models\250429_230757.centroid.n=349
Hey @CodyScholtens !
From your screenshot, it looks like you're trying to load a checkpoint from a model that was trained on grayscale (single-channel) images. However, your new labeled frames appear to come from a 3-channel (RGB) video. This mismatch causes an error because a model trained on single-channel inputs can't be directly used with 3-channel data. If your new data is indeed RGB (you can check this in the Videos tab in the GUI), the easiest workaround would be to train a new model from scratch.
Let me know if you have any questions!
Thanks,
Divya
I had the same problem. It looks like the cause was having some of the videos in grayscale mode and some of them in RGB. Solved by clicking on "Toggle Grayscale" in the Videos tab.