WHY IS BINARY SEGMENTATION SO DIFFICULT?
Hi,
I'm trying to do binary segmentation and am getting values on the validation and test data that I shouldn't. After 3 iterations it reports a perfect IoU, which can't be right. The data is just black-and-white masks of shape (512, 512). I'm following this git repo for mae_unet: https://github.com/implus/mae_segmentation.
I can't figure out how I'm supposed to make it work.
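A minimal sanity check of the raw mask values looks like this (the path is a placeholder for one of the '_mask.png' files):

import numpy as np
from PIL import Image

# Placeholder path: point this at any '_mask.png' from the dataset.
mask = np.array(Image.open('annotations/train/example_mask.png'))

print('dtype:', mask.dtype, 'shape:', mask.shape)
print('unique values:', np.unique(mask))
# With num_classes=2 and reduce_zero_label=False, mmseg expects the loaded
# mask to contain only the label indices 0 and 1 (plus 255 for ignored pixels).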
Test data:
+----------------------+-------+-------+
| Class                | IoU   | Acc   |
+----------------------+-------+-------+
| obj of Interest      | 100.0 | 100.0 |
| Background           | 0.0   | nan   |
+----------------------+-------+-------+
Summary:
+--------+------+-------+-------+
| Scope  | mIoU | mAcc  | aAcc  |
+--------+------+-------+-------+
| global | 50.0 | 100.0 | 100.0 |
+--------+------+-------+-------+
Validation data:
+----------------------+-------+-------+
| Class                | IoU   | Acc   |
+----------------------+-------+-------+
| obj of Interest      | 100.0 | 100.0 |
| Background           | 0.0   | nan   |
+----------------------+-------+-------+
Summary:
+--------+------+-------+-------+
| Scope  | mIoU | mAcc  | aAcc  |
+--------+------+-------+-------+
| global | 50.0 | 100.0 | 100.0 |
+--------+------+-------+-------+
Custom dataset:
from .custom import CustomDataset
from .builder import DATASETS
import os.path as osp


@DATASETS.register_module()
class CustomBinarySegDataset(CustomDataset):
    CLASSES = ('Particle of Interest', 'Background')
    PALETTE = [[1], [0]]

    def __init__(self, **kwargs):
        super(CustomBinarySegDataset, self).__init__(
            img_suffix='.png',
            seg_map_suffix='_mask.png',
            reduce_zero_label=False,
            **kwargs)
        assert osp.exists(self.img_dir)
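For reference, mmseg's CustomDataset ties the i-th name in CLASSES to pixel value i in the mask, and PALETTE is normally a list of [R, G, B] triplets used only for visualisation. A sketch of that convention for a mask that stores background as 0 and the object as 1 (an assumption about how the '_mask.png' files are encoded, not taken from the repo above):

# Sketch only: conventional mmseg layout if the masks use 0 = background
# and 1 = object.
CLASSES = ('Background', 'Particle of Interest')
PALETTE = [[0, 0, 0], [255, 255, 255]]  # RGB colours, used only for drawing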
Config:
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='/vast/home/mayolo/mae_git/mae/output_dir/checkpoint-799.pth',
    backbone=dict(
        type='MAE',
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        use_abs_pos_emb=True,
        use_rel_pos_bias=True,
        img_size=512,
        init_values=1.0,
        drop_path_rate=0.1,
        out_indices=[3, 5, 7, 11]),
    decode_head=dict(
        type='UPerHead',
        in_channels=[768, 768, 768, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=768,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=768,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=dict(type='SyncBN', requires_grad=True),
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
    train_cfg=dict(),
    test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341)))
dataset_type = 'CustomBinarySegDataset'
data_root = '/vast/home/mayolo/512x512_Seg_Aug_images/base'
img_norm_cfg = dict(mean=[0, 0, 0], std=[254, 254, 254], to_rgb=True)
crop_size = (512, 512)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=False),
    dict(type='ConvertToGrayScaleMask'),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=1),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg'])
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='ConvertToGrayScaleMask'),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img'])
        ])
]
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type='CustomBinarySegDataset',
        data_root='/vast/home/mayolo/512x512_Seg_Aug_images/base',
        img_dir='/vast/home/mayolo/512x512_Seg_Aug_images/base/images/train',
        ann_dir='/vast/home/mayolo/512x512_Seg_Aug_images/base/annotations/train',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', reduce_zero_label=False),
            dict(type='ConvertToGrayScaleMask'),
            dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
            dict(type='RandomCrop', crop_size=(512, 512), cat_max_ratio=1),
            dict(type='RandomFlip', prob=0.5),
            dict(type='PhotoMetricDistortion'),
            dict(type='Pad', size=(512, 512), pad_val=0, seg_pad_val=255),
            dict(type='DefaultFormatBundle'),
            dict(type='Collect', keys=['img', 'gt_semantic_seg'])
        ]),
    val=dict(
        type='CustomBinarySegDataset',
        data_root='/vast/home/mayolo/512x512_Seg_Aug_images/base',
        img_dir='/vast/home/mayolo/512x512_Seg_Aug_images/base/images/val',
        ann_dir='/vast/home/mayolo/512x512_Seg_Aug_images/base/annotations/val',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2048, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(type='ConvertToGrayScaleMask'),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]),
    test=dict(
        type='CustomBinarySegDataset',
        data_root='/vast/home/mayolo/512x512_Seg_Aug_images/base',
        img_dir='/vast/home/mayolo/512x512_Seg_Aug_images/base/images/val',
        ann_dir='/vast/home/mayolo/512x512_Seg_Aug_images/base/annotations/val',
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(
                type='MultiScaleFlipAug',
                img_scale=(2048, 512),
                flip=False,
                transforms=[
                    dict(type='Resize', keep_ratio=True),
                    dict(type='RandomFlip'),
                    dict(type='ConvertToGrayScaleMask'),
                    dict(type='ImageToTensor', keys=['img']),
                    dict(type='Collect', keys=['img'])
                ])
        ]))
log_config = dict(
    interval=50, hooks=[dict(type='TextLoggerHook', by_epoch=False)])
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True
optimizer = dict(
    type='AdamW',
    lr=0.0001,
    betas=(0.9, 0.999),
    weight_decay=0.01,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=12, layer_decay_rate=0.65))
optimizer_config = dict(
    type='DistOptimizerHook',
    update_interval=1,
    grad_clip=None,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=True)
lr_config = dict(
    policy='poly',
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-06,
    power=1.0,
    min_lr=0.0,
    by_epoch=False)
max_iters = 200000
runner = dict(type='IterBasedRunnerAmp', max_iters=200000)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=100, metric='mIoU')
fp16 = None
work_dir = './work_dirs/upernet_mae_base_12_512_slide_160k_ade20k'
gpu_ids = range(0, 1)
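A quick way to see what the network is actually predicting is to run single-image inference and inspect the label values; a sketch assuming the mmsegmentation 0.x API that mae_segmentation builds on (paths are placeholders):

import numpy as np
from mmseg.apis import init_segmentor, inference_segmentor

# Placeholder paths: the dumped config and a checkpoint from the work_dir.
config_file = 'path/to/upernet_mae_config.py'
checkpoint_file = './work_dirs/upernet_mae_base_12_512_slide_160k_ade20k/latest.pth'

model = init_segmentor(config_file, checkpoint_file, device='cuda:0')
pred = inference_segmentor(model, 'images/val/example.png')[0]
print('predicted labels and counts:', np.unique(pred, return_counts=True))
# If every pixel comes back as a single label, the 100 / 0 IoU split in the
# tables above is simply that constant prediction scored against the masks.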
The grayscale transform also acts as a normalization function (imports added here for completeness):
import cv2
import numpy as np

from ..builder import PIPELINES  # adjust to where PIPELINES lives in your tree


@PIPELINES.register_module()
class ConvertToGrayScaleMask(object):
    """Convert a color-coded segmentation map to a binary {0, 1} mask."""

    def __init__(self):
        pass

    def __call__(self, results):
        """Call function to convert the seg map to a binary mask."""
        for key in results.get('seg_fields', []):
            # Collapse 3-channel (color) masks to a single grayscale channel.
            if len(results[key].shape) == 3 and results[key].shape[2] == 3:
                results[key] = cv2.cvtColor(results[key], cv2.COLOR_BGR2GRAY)
            # Stretch the value range to 0-255, then binarise to {0, 1}.
            results[key] = cv2.normalize(
                results[key], None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
            results[key] = np.where(results[key] > 0, 1, 0).astype(
                np.float32)  # float32 so that test.py runs
        return results

    def __repr__(self):
        return self.__class__.__name__
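A small self-test of the transform on a synthetic mask shows exactly what ends up in seg_fields (this assumes the class above is importable, or is run in the same module):

import numpy as np

# Synthetic 3-channel mask: all black except a fake 'object' square.
dummy = dict(
    seg_fields=['gt_semantic_seg'],
    gt_semantic_seg=np.zeros((512, 512, 3), dtype=np.uint8))
dummy['gt_semantic_seg'][100:200, 100:200] = 255

out = ConvertToGrayScaleMask()(dummy)
seg = out['gt_semantic_seg']
print(seg.dtype, np.unique(seg))  # float32, values 0.0 and 1.0
# Note: mmseg's DefaultFormatBundle later casts gt_semantic_seg to int64
# before it reaches the loss, so the values matter more than the dtype here.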
Try using a smaller learning rate and a longer warmup.
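For example, a gentler schedule could be sketched like this (the numbers are only illustrative starting points, not tuned values):

# Illustrative only: smaller base LR and longer warmup than the config above.
optimizer = dict(
    type='AdamW',
    lr=2e-05,
    betas=(0.9, 0.999),
    weight_decay=0.01,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=12, layer_decay_rate=0.65))
lr_config = dict(
    policy='poly',
    warmup='linear',
    warmup_iters=3000,
    warmup_ratio=1e-06,
    power=1.0,
    min_lr=0.0,
    by_epoch=False)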
The main thing you can do is introduce a Dice loss alongside the IoU metric and then optimize your model based on the results of both losses.
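As a sketch, assuming an mmsegmentation 0.x release new enough to register DiceLoss and to accept a list of losses in loss_decode, the decode head could combine the two like this:

# Sketch only: cross-entropy plus Dice on the decode head; weights are a guess.
decode_head = dict(
    type='UPerHead',
    in_channels=[768, 768, 768, 768],
    in_index=[0, 1, 2, 3],
    pool_scales=(1, 2, 3, 6),
    channels=768,
    dropout_ratio=0.1,
    num_classes=2,
    norm_cfg=dict(type='SyncBN', requires_grad=True),
    loss_decode=[
        dict(type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
        dict(type='DiceLoss', loss_weight=1.0)
    ])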