NaN loss and all-black predictions.
Hi, I come from a medical background and am very new to machine learning. I am trying to train a U-Net model with Keras and TensorFlow for image segmentation. However, the loss is NaN and the predictions are all black.
I have checked the model summary and it looks fine to me. I have also checked the ground-truth masks for NaN values with print(np.any(np.isnan(Mask))) and found no problem with the input data.
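In case it helps, these are roughly the checks I run on one image/mask pair (Mask is the ground-truth array; Image here is just a placeholder name for the corresponding input array):

import numpy as np
# Basic sanity checks on one image/mask pair (Image is a placeholder for the input array).
print('image NaNs:', np.any(np.isnan(Image)))    # NaNs in the inputs would propagate straight into the loss
print('mask NaNs:', np.any(np.isnan(Mask)))
print('image range:', Image.min(), Image.max())  # expect [0, 1] after rescale=1./255
print('mask range:', Mask.min(), Mask.max())     # expect only 0 and 1 for a binary mask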
I have been stuck on this problem for almost two months and would like to ask the community for help with debugging my code and model.
Thanks in advance for your help.
My code is below; the model summary follows toward the end of this issue:
import os
import matplotlib.pyplot as plt
import tensorflow as tf
from keras_preprocessing.image import ImageDataGenerator
from tensorflow import keras
#Define constants
SEED = 909
BATCH_SIZE_TRAIN = 8
BATCH_SIZE_TEST = 8
IMAGE_HEIGHT = 512
IMAGE_WIDTH = 512
IMG_SIZE = (IMAGE_HEIGHT, IMAGE_WIDTH)
data_dir = '/home/data'
data_dir_train = os.path.join(data_dir, 'training')
data_dir_train_image = os.path.join(data_dir_train, 'img')
data_dir_train_mask = os.path.join(data_dir_train, 'mask')
data_dir_test = os.path.join(data_dir, 'test')
data_dir_test_image = os.path.join(data_dir_test, 'img')
data_dir_test_mask = os.path.join(data_dir_test, 'mask')
NUM_TRAIN = 1413
NUM_TEST = 210
NUM_OF_EPOCHS = 10
def create_segmentation_generator_train(img_path, mask_path, BATCH_SIZE):
    data_gen_args = dict(rescale=1./255)
    img_datagen = ImageDataGenerator(**data_gen_args)
    # The double asterisk is required here: a single * would unpack only the dict keys
    # as positional arguments, so the masks would never be rescaled to [0, 1].
    mask_datagen = ImageDataGenerator(**data_gen_args)
    img_generator = img_datagen.flow_from_directory(img_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    mask_generator = mask_datagen.flow_from_directory(mask_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    return zip(img_generator, mask_generator)

def create_segmentation_generator_test(img_path, mask_path, BATCH_SIZE):
    data_gen_args = dict(rescale=1./255)
    img_datagen = ImageDataGenerator(**data_gen_args)
    mask_datagen = ImageDataGenerator(**data_gen_args)  # same double-asterisk caveat as above
    img_generator = img_datagen.flow_from_directory(img_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    mask_generator = mask_datagen.flow_from_directory(mask_path, target_size=IMG_SIZE, class_mode=None, color_mode='grayscale', batch_size=BATCH_SIZE, seed=SEED)
    return zip(img_generator, mask_generator)
def display(display_list):
    plt.figure(figsize=(15, 15))
    title = ['Input Image', 'True Mask', 'Predicted Mask']
    for i in range(len(display_list)):
        plt.subplot(1, len(display_list), i+1)
        plt.title(title[i])
        plt.imshow(tf.keras.preprocessing.image.array_to_img(display_list[i]), cmap='gray')
    plt.show()
def unet(n_levels, initial_features=32, n_blocks=2, kernel_size=3, pooling_size=2, in_channels=1, out_channels=1):
    # n_blocks = how many convolutions in each level
    inputs = keras.layers.Input(shape=(IMAGE_HEIGHT, IMAGE_WIDTH, in_channels))
    x = inputs
    convpars = dict(kernel_size=kernel_size, activation='relu', padding='same')
    # downstream
    skips = {}
    for level in range(n_levels):
        for _ in range(n_blocks):
            x = keras.layers.Conv2D(initial_features * 2 ** level, **convpars)(x)
        if level < n_levels - 1:
            skips[level] = x
            x = keras.layers.MaxPool2D(pooling_size)(x)
    # upstream
    for level in reversed(range(n_levels - 1)):
        x = keras.layers.Conv2DTranspose(initial_features * 2 ** level, strides=pooling_size, **convpars)(x)
        x = keras.layers.Concatenate()([x, skips[level]])
        for _ in range(n_blocks):
            x = keras.layers.Conv2D(initial_features * 2 ** level, **convpars)(x)
    # output: use the activation chosen above instead of hardcoding 'sigmoid'
    activation = 'sigmoid' if out_channels == 1 else 'softmax'
    x = keras.layers.Conv2D(out_channels, kernel_size=1, activation=activation, padding='same')(x)
    return keras.Model(inputs=[inputs], outputs=[x], name=f'UNET-L{n_levels}-F{initial_features}')
EPOCH_STEP_TRAIN = NUM_TRAIN // BATCH_SIZE_TRAIN
EPOCH_STEP_TEST = NUM_TEST // BATCH_SIZE_TEST  # use the test batch size for the validation steps
train_generator = create_segmentation_generator_train(data_dir_train_image, data_dir_train_mask, BATCH_SIZE_TRAIN)
test_generator = create_segmentation_generator_test(data_dir_test_image, data_dir_test_mask, BATCH_SIZE_TEST)
model = unet(4)
model.compile(optimizer=tf.keras.optimizers.Adam(clipvalue=0.5), loss='binary_crossentropy', metrics=['accuracy'])
# fit_generator is deprecated in newer TF releases; model.fit accepts generators as well
model.fit_generator(generator=train_generator, steps_per_epoch=EPOCH_STEP_TRAIN, validation_data=test_generator, validation_steps=EPOCH_STEP_TEST, epochs=NUM_OF_EPOCHS)
Model: "UNET-L4-F32"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_1 (InputLayer) [(None, 512, 512, 1) 0
__________________________________________________________________________________________________
conv2d (Conv2D) (None, 512, 512, 32) 320 input_1[0][0]
__________________________________________________________________________________________________
conv2d_1 (Conv2D) (None, 512, 512, 32) 9248 conv2d[0][0]
__________________________________________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 256, 256, 32) 0 conv2d_1[0][0]
__________________________________________________________________________________________________
conv2d_2 (Conv2D) (None, 256, 256, 64) 18496 max_pooling2d[0][0]
__________________________________________________________________________________________________
conv2d_3 (Conv2D) (None, 256, 256, 64) 36928 conv2d_2[0][0]
__________________________________________________________________________________________________
max_pooling2d_1 (MaxPooling2D) (None, 128, 128, 64) 0 conv2d_3[0][0]
__________________________________________________________________________________________________
conv2d_4 (Conv2D) (None, 128, 128, 128 73856 max_pooling2d_1[0][0]
__________________________________________________________________________________________________
conv2d_5 (Conv2D) (None, 128, 128, 128 147584 conv2d_4[0][0]
__________________________________________________________________________________________________
max_pooling2d_2 (MaxPooling2D) (None, 64, 64, 128) 0 conv2d_5[0][0]
__________________________________________________________________________________________________
conv2d_6 (Conv2D) (None, 64, 64, 256) 295168 max_pooling2d_2[0][0]
__________________________________________________________________________________________________
conv2d_7 (Conv2D) (None, 64, 64, 256) 590080 conv2d_6[0][0]
__________________________________________________________________________________________________
conv2d_transpose (Conv2DTranspo (None, 128, 128, 128 295040 conv2d_7[0][0]
__________________________________________________________________________________________________
concatenate (Concatenate) (None, 128, 128, 256 0 conv2d_transpose[0][0]
conv2d_5[0][0]
__________________________________________________________________________________________________
conv2d_8 (Conv2D) (None, 128, 128, 128 295040 concatenate[0][0]
__________________________________________________________________________________________________
conv2d_9 (Conv2D) (None, 128, 128, 128 147584 conv2d_8[0][0]
__________________________________________________________________________________________________
conv2d_transpose_1 (Conv2DTrans (None, 256, 256, 64) 73792 conv2d_9[0][0]
__________________________________________________________________________________________________
concatenate_1 (Concatenate) (None, 256, 256, 128 0 conv2d_transpose_1[0][0]
conv2d_3[0][0]
__________________________________________________________________________________________________
conv2d_10 (Conv2D) (None, 256, 256, 64) 73792 concatenate_1[0][0]
__________________________________________________________________________________________________
conv2d_11 (Conv2D) (None, 256, 256, 64) 36928 conv2d_10[0][0]
__________________________________________________________________________________________________
conv2d_transpose_2 (Conv2DTrans (None, 512, 512, 32) 18464 conv2d_11[0][0]
__________________________________________________________________________________________________
concatenate_2 (Concatenate) (None, 512, 512, 64) 0 conv2d_transpose_2[0][0]
conv2d_1[0][0]
__________________________________________________________________________________________________
conv2d_12 (Conv2D) (None, 512, 512, 32) 18464 concatenate_2[0][0]
__________________________________________________________________________________________________
conv2d_13 (Conv2D) (None, 512, 512, 32) 9248 conv2d_12[0][0]
__________________________________________________________________________________________________
conv2d_14 (Conv2D) (None, 512, 512, 1) 33 conv2d_13[0][0]
==================================================================================================
Total params: 2,140,065
Trainable params: 2,140,065
Non-trainable params: 0
If your predictions are all coming out as zero and the loss is NaN, chances are you're providing improperly scaled images to the model ([0, 1], [-1, 1], [0, 255], etc.).
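A quick way to check, assuming your train_generator yields (image, mask) batches like the zip in the question:

# Pull one batch and inspect the value ranges.
images, masks = next(train_generator)
print('images:', images.min(), images.max())  # should lie within [0, 1] given rescale=1./255
print('masks:', masks.min(), masks.max())     # should be 0/1; values up to 255 here can blow up binary_crossentropy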
Why are you not using this repo's UNet model?
I also faced the same problem.
- I would suggest checking the root data path first.
- Check that the images and masks are correctly scaled, as @JordanMakesMaps pointed out.
- Use models from the segmentation_models lib (see the sketch below).
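For example, a minimal sketch with segmentation_models (the resnet34 backbone is just an illustrative choice; encoder_weights=None because the pretrained weights expect 3-channel inputs, while these scans are grayscale):

import os
os.environ['SM_FRAMEWORK'] = 'tf.keras'  # select the tf.keras backend before importing (needed for TF 2.x)
import segmentation_models as sm

# U-Net with a ResNet-34 encoder for 512x512 grayscale inputs and a single binary output channel.
model = sm.Unet('resnet34', input_shape=(512, 512, 1), classes=1, activation='sigmoid', encoder_weights=None)
model.compile(optimizer='adam', loss=sm.losses.bce_jaccard_loss, metrics=[sm.metrics.iou_score])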