automl icon indicating copy to clipboard operation
automl copied to clipboard

Training with the backbone frozen, then continuing without freezing any layers

Open ChulanZhang opened this issue 4 years ago • 0 comments

Hi, I am training EfficientDet on a custom dataset. I started by freezing the backbone ('var_freeze_expr: '(efficientnet)'' in the hyperparameter file), and after 50 epochs the mAP reached a good value. I would like to continue fine-tuning from that point with all layers trainable, so I removed the freeze option from the hyperparameter file. Training then fails with the following error: ''' 2021-06-25 17:16:07.760335: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0 I0625 17:16:09.326131 139906640938816 main.py:264] {'name': 'efficientdet-d3', 'act_type': 'swish', 'image_size': (896, 896), 'target_size': None, 'input_rand_hflip': True, 'jitter_min': 0.1, 'jitter_max': 2.0, 'autoaugment_policy': None, 'grid_mask': False, 'sample_image': None, 'map_freq': 5, 'num_classes': 31, 'seg_num_classes': 3, 'heads': ['object_detection'], 'skip_crowd_during_training': True, 'label_map': {1: 'airplane', 2: 'antelope', 3: 'bear', 4: 'bicycle', 5: 'bird', 6: 'bus', 7: 'car', 8: 'cattle', 9: 'dog', 10: 'demestic_cat', 11: 'elephant', 12: 'fox', 13: 'giant_panda', 14: 'hamster', 15: 'horse', 16: 'lion', 17: 'lizard', 18: 'monkey', 19: 'motorcycle', 20: 'rabbit', 21: 'red_panda', 22: 'sheep', 23: 'snake', 24: 'squirrel', 25: 'tiger', 26: 'train', 27: 'turtle', 28: 'watercraft', 29: 'whale', 30: 'zebra'}, 'max_instances_per_image': 100, 'regenerate_source_id': False, 'min_level': 3, 'max_level': 7, 'num_scales': 3, 'aspect_ratios': [1.0, 2.0, 0.5], 'anchor_scale': 4.0, 'is_training_bn': True, 'momentum': 0.9, 'optimizer': 'sgd', 'learning_rate': 0.008, 'lr_warmup_init': 0.008, 'lr_warmup_epoch': 1.0, 'first_lr_drop_epoch': 200.0, 'second_lr_drop_epoch': 250.0, 'poly_lr_power': 0.9, 'clip_gradients_norm': 10.0, 'num_epochs': 100, 'data_format': 'channels_last', 'mean_rgb': [123.675, 116.28, 103.53], 'stddev_rgb': [58.395, 57.120000000000005, 57.375], 'label_smoothing': 0.0, 'alpha': 0.25, 'gamma': 1.5, 'delta': 0.1, 'box_loss_weight': 50.0, 
'iou_loss_type': None, 'iou_loss_weight': 1.0, 'weight_decay': 4e-05, 'strategy': None, 'mixed_precision': False, 'loss_scale': None, 'model_optimizations': {}, 'box_class_repeats': 4, 'fpn_cell_repeats': 6, 'fpn_num_filters': 160, 'separable_conv': True, 'apply_bn_for_resampling': True, 'conv_after_downsample': False, 'conv_bn_act_pattern': False, 'drop_remainder': True, 'nms_configs': {'method': 'gaussian', 'iou_thresh': None, 'score_thresh': 0.0, 'sigma': None, 'pyfunc': False, 'max_nms_inputs': 0, 'max_output_size': 100}, 'tflite_max_detections': 100, 'fpn_name': None, 'fpn_weight_method': None, 'fpn_config': None, 'survival_prob': None, 'img_summary_steps': None, 'lr_decay_method': 'cosine', 'moving_average_decay': 0.9998, 'ckpt_var_scope': None, 'skip_mismatch': True, 'backbone_name': 'efficientnet-b3', 'backbone_config': None, 'var_freeze_expr': None, 'use_keras_model': True, 'dataset_type': None, 'positives_momentum': None, 'grad_checkpoint': False, 'model_name': 'efficientdet-d3', 'iterations_per_loop': 100, 'model_dir': 'model_dir/efficientdet-d3-finetune-01', 'num_shards': 8, 'num_examples_per_epoch': 11768, 'backbone_ckpt': '', 'ckpt': 'efficientdet-d3', 'val_json_file': None, 'testdev_dir': None, 'profile': False, 'mode': 'train'} INFO:tensorflow:Using config: {'_model_dir': 'model_dir/efficientdet-d3-finetune-01', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', 
'_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1} I0625 17:16:09.360116 139906640938816 estimator.py:191] Using config: {'_model_dir': 'model_dir/efficientdet-d3-finetune-01', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1} INFO:tensorflow:Using config: {'_model_dir': 'model_dir/efficientdet-d3-finetune-01', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1} I0625 17:16:09.360814 139906640938816 estimator.py:191] Using config: {'_model_dir': 'model_dir/efficientdet-d3-finetune-01', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 2000, 
'_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true , '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1} WARNING:tensorflow:From /home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version. Instructions for updating: Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts. W0625 17:16:09.373391 139906640938816 deprecation.py:339] From /home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/training_util.py:236: Variable.initialized_value (from tensorflow.python.ops.variables) is deprecated and will be removed in a future version. Instructions for updating: Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts. 
2021-06-25 17:16:09.379343: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set 2021-06-25 17:16:09.380486: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1 2021-06-25 17:16:09.424504: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:16:09.425526: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: pciBusID: 0000:00:1e.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-06-25 17:16:09.425573: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0 2021-06-25 17:16:09.429061: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11 2021-06-25 17:16:09.429138: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11 2021-06-25 17:16:09.431488: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10 2021-06-25 17:16:09.431901: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10 2021-06-25 17:16:09.434493: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10 2021-06-25 17:16:09.435395: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11 2021-06-25 17:16:09.435634: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8 2021-06-25 17:16:09.435811: I 
tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:16:09.436864: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:16:09.437816: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0 I0625 17:16:09.698347 139906640938816 dataloader.py:85] target_size = (896, 896), output_size = (896, 896) INFO:tensorflow:Calling model_fn. I0625 17:16:10.176553 139906640938816 estimator.py:1162] Calling model_fn. I0625 17:16:10.181638 139906640938816 efficientnet_builder.py:215] global_params= GlobalParams(batch_norm_momentum=0.99, batch_norm_epsilon=0.001, dropout_rate=0.3, data_format='channels_last', num_classes=1000, width_coefficient=1.2, depth_coefficient=1.4, depth_divisor=8, min_depth=None, survival_prob=0.8, relu_fn=functools.partial(<function activation_fn at 0x7f3e01910830>, act_type='swish'), batch_norm=<class 'utils.BatchNormalization'>, use_se=True, local_pooling=None, condconv_num_experts=None, clip_projection_output=False, blocks_args=['r1_k3_s11_e1_i32_o16_se0.25', 'r2_k3_s22_e6_i16_o24_se0.25', 'r2_k5_s22_e6_i24_o40_se0.25', 'r3_k3_s22_e6_i40_o80_se0.25', 'r3_k5_s11_e6_i80_o112_se0.25', 'r4_k5_s22_e6_i112_o192_se0.25', 'r1_k3_s11_e6_i192_o320_se0.25'], fix_head_stem=None, grad_checkpoint=False) I0625 17:16:10.669727 139906640938816 efficientdet_keras.py:760] fnode 0 : {'feat_level': 6, 'inputs_offsets': [3, 4]} I0625 17:16:10.670932 139906640938816 efficientdet_keras.py:760] fnode 1 : {'feat_level': 5, 'inputs_offsets': [2, 5]} I0625 17:16:10.671916 139906640938816 efficientdet_keras.py:760] fnode 2 : {'feat_level': 4, 'inputs_offsets': [1, 6]} I0625 17:16:10.672888 139906640938816 efficientdet_keras.py:760] fnode 3 : 
{'feat_level': 3, 'inputs_offsets': [0, 7]} I0625 17:16:10.673874 139906640938816 efficientdet_keras.py:760] fnode 4 : {'feat_level': 4, 'inputs_offsets': [1, 7, 8]} I0625 17:16:10.674889 139906640938816 efficientdet_keras.py:760] fnode 5 : {'feat_level': 5, 'inputs_offsets': [2, 6, 9]} I0625 17:16:10.675878 139906640938816 efficientdet_keras.py:760] fnode 6 : {'feat_level': 6, 'inputs_offsets': [3, 5, 10]} I0625 17:16:10.676877 139906640938816 efficientdet_keras.py:760] fnode 7 : {'feat_level': 7, 'inputs_offsets': [4, 11]} I0625 17:16:10.678415 139906640938816 efficientdet_keras.py:760] fnode 0 : {'feat_level': 6, 'inputs_offsets': [3, 4]} I0625 17:16:10.679430 139906640938816 efficientdet_keras.py:760] fnode 1 : {'feat_level': 5, 'inputs_offsets': [2, 5]} I0625 17:16:10.680409 139906640938816 efficientdet_keras.py:760] fnode 2 : {'feat_level': 4, 'inputs_offsets': [1, 6]} I0625 17:16:10.681375 139906640938816 efficientdet_keras.py:760] fnode 3 : {'feat_level': 3, 'inputs_offsets': [0, 7]} I0625 17:16:10.682352 139906640938816 efficientdet_keras.py:760] fnode 4 : {'feat_level': 4, 'inputs_offsets': [1, 7, 8]} I0625 17:16:10.683428 139906640938816 efficientdet_keras.py:760] fnode 5 : {'feat_level': 5, 'inputs_offsets': [2, 6, 9]} I0625 17:16:10.684415 139906640938816 efficientdet_keras.py:760] fnode 6 : {'feat_level': 6, 'inputs_offsets': [3, 5, 10]} I0625 17:16:10.685389 139906640938816 efficientdet_keras.py:760] fnode 7 : {'feat_level': 7, 'inputs_offsets': [4, 11]} I0625 17:16:10.686903 139906640938816 efficientdet_keras.py:760] fnode 0 : {'feat_level': 6, 'inputs_offsets': [3, 4]} I0625 17:16:10.687863 139906640938816 efficientdet_keras.py:760] fnode 1 : {'feat_level': 5, 'inputs_offsets': [2, 5]} I0625 17:16:10.688851 139906640938816 efficientdet_keras.py:760] fnode 2 : {'feat_level': 4, 'inputs_offsets': [1, 6]} I0625 17:16:10.689883 139906640938816 efficientdet_keras.py:760] fnode 3 : {'feat_level': 3, 'inputs_offsets': [0, 7]} I0625 17:16:10.690848 
139906640938816 efficientdet_keras.py:760] fnode 4 : {'feat_level': 4, 'inputs_offsets': [1, 7, 8]} I0625 17:16:10.691906 139906640938816 efficientdet_keras.py:760] fnode 5 : {'feat_level': 5, 'inputs_offsets': [2, 6, 9]} I0625 17:16:10.692902 139906640938816 efficientdet_keras.py:760] fnode 6 : {'feat_level': 6, 'inputs_offsets': [3, 5, 10]} I0625 17:16:10.693883 139906640938816 efficientdet_keras.py:760] fnode 7 : {'feat_level': 7, 'inputs_offsets': [4, 11]} I0625 17:16:10.695932 139906640938816 efficientdet_keras.py:760] fnode 0 : {'feat_level': 6, 'inputs_offsets': [3, 4]} I0625 17:16:10.696901 139906640938816 efficientdet_keras.py:760] fnode 1 : {'feat_level': 5, 'inputs_offsets': [2, 5]} I0625 17:16:10.697885 139906640938816 efficientdet_keras.py:760] fnode 2 : {'feat_level': 4, 'inputs_offsets': [1, 6]} I0625 17:16:10.698887 139906640938816 efficientdet_keras.py:760] fnode 3 : {'feat_level': 3, 'inputs_offsets': [0, 7]} I0625 17:16:10.699865 139906640938816 efficientdet_keras.py:760] fnode 4 : {'feat_level': 4, 'inputs_offsets': [1, 7, 8]} I0625 17:16:10.700859 139906640938816 efficientdet_keras.py:760] fnode 5 : {'feat_level': 5, 'inputs_offsets': [2, 6, 9]} I0625 17:16:10.701852 139906640938816 efficientdet_keras.py:760] fnode 6 : {'feat_level': 6, 'inputs_offsets': [3, 5, 10]} I0625 17:16:10.702919 139906640938816 efficientdet_keras.py:760] fnode 7 : {'feat_level': 7, 'inputs_offsets': [4, 11]} I0625 17:16:10.704428 139906640938816 efficientdet_keras.py:760] fnode 0 : {'feat_level': 6, 'inputs_offsets': [3, 4]} I0625 17:16:10.705388 139906640938816 efficientdet_keras.py:760] fnode 1 : {'feat_level': 5, 'inputs_offsets': [2, 5]} I0625 17:16:10.706364 139906640938816 efficientdet_keras.py:760] fnode 2 : {'feat_level': 4, 'inputs_offsets': [1, 6]} I0625 17:16:10.707381 139906640938816 efficientdet_keras.py:760] fnode 3 : {'feat_level': 3, 'inputs_offsets': [0, 7]} I0625 17:16:10.708367 139906640938816 efficientdet_keras.py:760] fnode 4 : {'feat_level': 4, 
'inputs_offsets': [1, 7, 8]} I0625 17:16:10.709405 139906640938816 efficientdet_keras.py:760] fnode 5 : {'feat_level': 5, 'inputs_offsets': [2, 6, 9]} I0625 17:16:10.710373 139906640938816 efficientdet_keras.py:760] fnode 6 : {'feat_level': 6, 'inputs_offsets': [3, 5, 10]} I0625 17:16:10.711400 139906640938816 efficientdet_keras.py:760] fnode 7 : {'feat_level': 7, 'inputs_offsets': [4, 11]} I0625 17:16:10.712884 139906640938816 efficientdet_keras.py:760] fnode 0 : {'feat_level': 6, 'inputs_offsets': [3, 4]} I0625 17:16:10.713846 139906640938816 efficientdet_keras.py:760] fnode 1 : {'feat_level': 5, 'inputs_offsets': [2, 5]} I0625 17:16:10.714908 139906640938816 efficientdet_keras.py:760] fnode 2 : {'feat_level': 4, 'inputs_offsets': [1, 6]} I0625 17:16:10.715889 139906640938816 efficientdet_keras.py:760] fnode 3 : {'feat_level': 3, 'inputs_offsets': [0, 7]} I0625 17:16:10.716856 139906640938816 efficientdet_keras.py:760] fnode 4 : {'feat_level': 4, 'inputs_offsets': [1, 7, 8]} I0625 17:16:10.717847 139906640938816 efficientdet_keras.py:760] fnode 5 : {'feat_level': 5, 'inputs_offsets': [2, 6, 9]} I0625 17:16:10.718826 139906640938816 efficientdet_keras.py:760] fnode 6 : {'feat_level': 6, 'inputs_offsets': [3, 5, 10]} I0625 17:16:10.719833 139906640938816 efficientdet_keras.py:760] fnode 7 : {'feat_level': 7, 'inputs_offsets': [4, 11]} I0625 17:16:10.842185 139906640938816 efficientnet_model.py:735] Built stem stem : (4, 448, 448, 40) I0625 17:16:10.842446 139906640938816 efficientnet_model.py:756] block_0 survival_prob: 1.0 I0625 17:16:10.842933 139906640938816 efficientnet_model.py:374] Block blocks_0 input shape: (4, 448, 448, 40) I0625 17:16:10.878249 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 448, 448, 40) I0625 17:16:10.905101 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 40) I0625 17:16:10.938494 139906640938816 efficientnet_model.py:414] Project shape: (4, 448, 448, 24) I0625 17:16:10.938880 139906640938816 
efficientnet_model.py:756] block_1 survival_prob: 0.9923076923076923 I0625 17:16:10.939317 139906640938816 efficientnet_model.py:374] Block blocks_1 input shape: (4, 448, 448, 24) I0625 17:16:10.974411 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 448, 448, 24) I0625 17:16:11.001070 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 24) I0625 17:16:11.042366 139906640938816 efficientnet_model.py:414] Project shape: (4, 448, 448, 24) I0625 17:16:11.042782 139906640938816 efficientnet_model.py:756] block_2 survival_prob: 0.9846153846153847 I0625 17:16:11.043238 139906640938816 efficientnet_model.py:374] Block blocks_2 input shape: (4, 448, 448, 24) I0625 17:16:11.077306 139906640938816 efficientnet_model.py:390] Expand shape: (4, 448, 448, 144) I0625 17:16:11.112699 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 224, 224, 144) I0625 17:16:11.139573 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 144) I0625 17:16:11.173578 139906640938816 efficientnet_model.py:414] Project shape: (4, 224, 224, 32) I0625 17:16:11.173918 139906640938816 efficientnet_model.py:756] block_3 survival_prob: 0.9769230769230769 I0625 17:16:11.174350 139906640938816 efficientnet_model.py:374] Block blocks_3 input shape: (4, 224, 224, 32) I0625 17:16:11.209160 139906640938816 efficientnet_model.py:390] Expand shape: (4, 224, 224, 192) I0625 17:16:11.244241 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 224, 224, 192) I0625 17:16:11.271175 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 192) I0625 17:16:11.312167 139906640938816 efficientnet_model.py:414] Project shape: (4, 224, 224, 32) I0625 17:16:11.312540 139906640938816 efficientnet_model.py:756] block_4 survival_prob: 0.9692307692307692 I0625 17:16:11.312978 139906640938816 efficientnet_model.py:374] Block blocks_4 input shape: (4, 224, 224, 32) I0625 17:16:11.347180 139906640938816 efficientnet_model.py:390] Expand shape: (4, 224, 224, 192) 
I0625 17:16:11.382422 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 224, 224, 192) I0625 17:16:11.409446 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 192) I0625 17:16:11.450609 139906640938816 efficientnet_model.py:414] Project shape: (4, 224, 224, 32) I0625 17:16:11.451075 139906640938816 efficientnet_model.py:756] block_5 survival_prob: 0.9615384615384616 I0625 17:16:11.451523 139906640938816 efficientnet_model.py:374] Block blocks_5 input shape: (4, 224, 224, 32) I0625 17:16:11.485887 139906640938816 efficientnet_model.py:390] Expand shape: (4, 224, 224, 192) I0625 17:16:11.522076 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 112, 112, 192) I0625 17:16:11.549073 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 192) I0625 17:16:11.582345 139906640938816 efficientnet_model.py:414] Project shape: (4, 112, 112, 48) I0625 17:16:11.582702 139906640938816 efficientnet_model.py:756] block_6 survival_prob: 0.9538461538461539 I0625 17:16:11.583166 139906640938816 efficientnet_model.py:374] Block blocks_6 input shape: (4, 112, 112, 48) I0625 17:16:11.617572 139906640938816 efficientnet_model.py:390] Expand shape: (4, 112, 112, 288) I0625 17:16:11.652944 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 112, 112, 288) I0625 17:16:11.680185 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 288) I0625 17:16:11.721240 139906640938816 efficientnet_model.py:414] Project shape: (4, 112, 112, 48) I0625 17:16:11.721596 139906640938816 efficientnet_model.py:756] block_7 survival_prob: 0.9461538461538461 I0625 17:16:11.722033 139906640938816 efficientnet_model.py:374] Block blocks_7 input shape: (4, 112, 112, 48) I0625 17:16:11.756512 139906640938816 efficientnet_model.py:390] Expand shape: (4, 112, 112, 288) I0625 17:16:11.791856 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 112, 112, 288) I0625 17:16:11.818970 139906640938816 efficientnet_model.py:195] Built SE se : (4, 
1, 1, 288) I0625 17:16:11.860406 139906640938816 efficientnet_model.py:414] Project shape: (4, 112, 112, 48) I0625 17:16:11.860868 139906640938816 efficientnet_model.py:756] block_8 survival_prob: 0.9384615384615385 I0625 17:16:11.861303 139906640938816 efficientnet_model.py:374] Block blocks_8 input shape: (4, 112, 112, 48) I0625 17:16:11.895908 139906640938816 efficientnet_model.py:390] Expand shape: (4, 112, 112, 288) I0625 17:16:11.931401 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 288) I0625 17:16:11.958648 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 288) I0625 17:16:11.992002 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 96) I0625 17:16:11.992374 139906640938816 efficientnet_model.py:756] block_9 survival_prob: 0.9307692307692308 I0625 17:16:11.992817 139906640938816 efficientnet_model.py:374] Block blocks_9 input shape: (4, 56, 56, 96) I0625 17:16:12.027576 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 576) I0625 17:16:12.063276 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 576) I0625 17:16:12.090582 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 576) I0625 17:16:12.131697 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 96) I0625 17:16:12.132057 139906640938816 efficientnet_model.py:756] block_10 survival_prob: 0.9230769230769231 I0625 17:16:12.132494 139906640938816 efficientnet_model.py:374] Block blocks_10 input shape: (4, 56, 56, 96) I0625 17:16:12.167734 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 576) I0625 17:16:12.203525 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 576) I0625 17:16:12.230700 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 576) I0625 17:16:12.271693 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 96) I0625 17:16:12.272066 139906640938816 efficientnet_model.py:756] block_11 survival_prob: 
0.9153846153846155 I0625 17:16:12.272504 139906640938816 efficientnet_model.py:374] Block blocks_11 input shape: (4, 56, 56, 96) I0625 17:16:12.307282 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 576) I0625 17:16:12.342971 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 576) I0625 17:16:12.370330 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 576) I0625 17:16:12.411368 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 96) I0625 17:16:12.411735 139906640938816 efficientnet_model.py:756] block_12 survival_prob: 0.9076923076923077 I0625 17:16:12.412176 139906640938816 efficientnet_model.py:374] Block blocks_12 input shape: (4, 56, 56, 96) I0625 17:16:12.447153 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 576) I0625 17:16:12.483055 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 576) I0625 17:16:12.511542 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 576) I0625 17:16:12.554342 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 96) I0625 17:16:12.554714 139906640938816 efficientnet_model.py:756] block_13 survival_prob: 0.9 I0625 17:16:12.555181 139906640938816 efficientnet_model.py:374] Block blocks_13 input shape: (4, 56, 56, 96) I0625 17:16:12.590065 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 576) I0625 17:16:12.625737 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 576) I0625 17:16:12.653147 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 576) I0625 17:16:12.686486 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 136) I0625 17:16:12.686845 139906640938816 efficientnet_model.py:756] block_14 survival_prob: 0.8923076923076924 I0625 17:16:12.687312 139906640938816 efficientnet_model.py:374] Block blocks_14 input shape: (4, 56, 56, 136) I0625 17:16:12.722139 139906640938816 efficientnet_model.py:390] Expand shape: (4, 
56, 56, 816) I0625 17:16:12.758525 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 816) I0625 17:16:12.786060 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 816) I0625 17:16:12.827103 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 136) I0625 17:16:12.827462 139906640938816 efficientnet_model.py:756] block_15 survival_prob: 0.8846153846153847 I0625 17:16:12.827904 139906640938816 efficientnet_model.py:374] Block blocks_15 input shape: (4, 56, 56, 136) I0625 17:16:12.863444 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 816) I0625 17:16:12.899500 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 816) I0625 17:16:12.927205 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 816) I0625 17:16:12.968337 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 136) I0625 17:16:12.968709 139906640938816 efficientnet_model.py:756] block_16 survival_prob: 0.8769230769230769 I0625 17:16:12.969148 139906640938816 efficientnet_model.py:374] Block blocks_16 input shape: (4, 56, 56, 136) I0625 17:16:13.004360 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 816) I0625 17:16:13.040476 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 816) I0625 17:16:13.068101 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 816) I0625 17:16:13.109157 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 136) I0625 17:16:13.109530 139906640938816 efficientnet_model.py:756] block_17 survival_prob: 0.8692307692307693 I0625 17:16:13.109971 139906640938816 efficientnet_model.py:374] Block blocks_17 input shape: (4, 56, 56, 136) I0625 17:16:13.146105 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 816) I0625 17:16:13.182468 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 56, 56, 816) I0625 17:16:13.210210 139906640938816 efficientnet_model.py:195] Built SE se : (4, 
1, 1, 816) I0625 17:16:13.251457 139906640938816 efficientnet_model.py:414] Project shape: (4, 56, 56, 136) I0625 17:16:13.251932 139906640938816 efficientnet_model.py:756] block_18 survival_prob: 0.8615384615384616 I0625 17:16:13.252383 139906640938816 efficientnet_model.py:374] Block blocks_18 input shape: (4, 56, 56, 136) I0625 17:16:13.287883 139906640938816 efficientnet_model.py:390] Expand shape: (4, 56, 56, 816) I0625 17:16:13.324238 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 816) I0625 17:16:13.351985 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 816) I0625 17:16:13.385221 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 232) I0625 17:16:13.385610 139906640938816 efficientnet_model.py:756] block_19 survival_prob: 0.8538461538461539 I0625 17:16:13.386058 139906640938816 efficientnet_model.py:374] Block blocks_19 input shape: (4, 28, 28, 232) I0625 17:16:13.559290 139906640938816 efficientnet_model.py:390] Expand shape: (4, 28, 28, 1392) I0625 17:16:13.603468 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 1392) I0625 17:16:13.633133 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 1392) I0625 17:16:13.674879 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 232) I0625 17:16:13.675415 139906640938816 efficientnet_model.py:756] block_20 survival_prob: 0.8461538461538463 I0625 17:16:13.675885 139906640938816 efficientnet_model.py:374] Block blocks_20 input shape: (4, 28, 28, 232) I0625 17:16:13.716887 139906640938816 efficientnet_model.py:390] Expand shape: (4, 28, 28, 1392) I0625 17:16:13.758516 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 1392) I0625 17:16:13.787744 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 1392) I0625 17:16:13.829301 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 232) I0625 17:16:13.829815 139906640938816 efficientnet_model.py:756] block_21 
survival_prob: 0.8384615384615385 I0625 17:16:13.830308 139906640938816 efficientnet_model.py:374] Block blocks_21 input shape: (4, 28, 28, 232) I0625 17:16:13.872681 139906640938816 efficientnet_model.py:390] Expand shape: (4, 28, 28, 1392) I0625 17:16:13.914737 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 1392) I0625 17:16:13.944356 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 1392) I0625 17:16:13.985895 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 232) I0625 17:16:13.986343 139906640938816 efficientnet_model.py:756] block_22 survival_prob: 0.8307692307692308 I0625 17:16:13.986801 139906640938816 efficientnet_model.py:374] Block blocks_22 input shape: (4, 28, 28, 232) I0625 17:16:14.027969 139906640938816 efficientnet_model.py:390] Expand shape: (4, 28, 28, 1392) I0625 17:16:14.069461 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 1392) I0625 17:16:14.099914 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 1392) I0625 17:16:14.143105 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 232) I0625 17:16:14.143692 139906640938816 efficientnet_model.py:756] block_23 survival_prob: 0.8230769230769232 I0625 17:16:14.144236 139906640938816 efficientnet_model.py:374] Block blocks_23 input shape: (4, 28, 28, 232) I0625 17:16:14.186516 139906640938816 efficientnet_model.py:390] Expand shape: (4, 28, 28, 1392) I0625 17:16:14.229245 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 1392) I0625 17:16:14.259201 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 1392) I0625 17:16:14.301084 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 232) I0625 17:16:14.301601 139906640938816 efficientnet_model.py:756] block_24 survival_prob: 0.8153846153846154 I0625 17:16:14.302068 139906640938816 efficientnet_model.py:374] Block blocks_24 input shape: (4, 28, 28, 232) I0625 17:16:14.343788 139906640938816 
efficientnet_model.py:390] Expand shape: (4, 28, 28, 1392) I0625 17:16:14.387208 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 1392) I0625 17:16:14.416817 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 1392) I0625 17:16:14.450587 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 384) I0625 17:16:14.451113 139906640938816 efficientnet_model.py:756] block_25 survival_prob: 0.8076923076923077 I0625 17:16:14.451584 139906640938816 efficientnet_model.py:374] Block blocks_25 input shape: (4, 28, 28, 384) I0625 17:16:14.493397 139906640938816 efficientnet_model.py:390] Expand shape: (4, 28, 28, 2304) I0625 17:16:14.536113 139906640938816 efficientnet_model.py:393] DWConv shape: (4, 28, 28, 2304) I0625 17:16:14.566161 139906640938816 efficientnet_model.py:195] Built SE se : (4, 1, 1, 2304) I0625 17:16:14.608162 139906640938816 efficientnet_model.py:414] Project shape: (4, 28, 28, 384) I0625 17:16:20.420781 139906640938816 det_model_fn.py:81] LR schedule method: cosine I0625 17:16:20.803363 139906640938816 utils.py:373] Adding scale summary ('lrn_rate', <tf.Tensor 'Select:0' shape=() dtype=float32>) I0625 17:16:20.806988 139906640938816 utils.py:373] Adding scale summary ('trainloss/cls_loss', <tf.Tensor 'AddN:0' shape=() dtype=float32>) I0625 17:16:20.810234 139906640938816 utils.py:373] Adding scale summary ('trainloss/box_loss', <tf.Tensor 'AddN_1:0' shape=() dtype=float32>) I0625 17:16:20.813514 139906640938816 utils.py:373] Adding scale summary ('trainloss/det_loss', <tf.Tensor 'add_3:0' shape=() dtype=float32>) I0625 17:16:20.816778 139906640938816 utils.py:373] Adding scale summary ('trainloss/reg_l2_loss', <tf.Tensor 'mul_14:0' shape=() dtype=float32>) I0625 17:16:20.820070 139906640938816 utils.py:373] Adding scale summary ('trainloss/loss', <tf.Tensor 'add_4:0' shape=() dtype=float32>) I0625 17:16:20.825082 139906640938816 utils.py:373] Adding scale summary ('train_epochs', <tf.Tensor 'truediv_7:0' 
shape=() dtype=float32>) I0625 17:16:20.851104 139906640938816 det_model_fn.py:397] clip gradients norm by 10.000000 I0625 17:16:36.630142 139906640938816 utils.py:373] Adding scale summary ('gradient_norm', <tf.Tensor 'clip/global_norm_1/global_norm:0' shape=() dtype=float32>) I0625 17:16:57.169491 139906640938816 det_model_fn.py:539] restore variables from efficientdet-d3 I0625 17:16:57.169736 139906640938816 utils.py:99] Init model from checkpoint efficientdet-d3 I0625 17:16:57.180786 139906640938816 utils.py:155] Init global_step from ckpt var global_step I0625 17:16:57.180922 139906640938816 utils.py:155] Init efficientnet-b3/stem/conv2d/kernel from ckpt var efficientnet-b3/stem/conv2d/kernel I0625 17:16:57.181056 139906640938816 utils.py:155] Init efficientnet-b3/stem/tpu_batch_normalization/gamma from ckpt var efficientnet-b3/stem/tpu_batch_normalization/gamma I0625 17:16:57.181187 139906640938816 utils.py:155] Init efficientnet-b3/stem/tpu_batch_normalization/beta from ckpt var efficientnet-b3/stem/tpu_batch_normalization/beta I0625 17:16:57.181307 139906640938816 utils.py:155] Init efficientnet-b3/stem/tpu_batch_normalization/moving_mean from ckpt var efficientnet-b3/stem/tpu_batch_normalization/moving_mean I0625 17:16:57.190458 139906640938816 utils.py:148] skip class_net/class-predict/pointwise_kernel ((1, 1, 160, 279) vs [1, 1, 160, 810]) -- shape mismatch I0625 17:16:57.190581 139906640938816 utils.py:148] skip class_net/class-predict/bias ((279,) vs [810]) -- shape mismatch I0625 17:16:57.199553 139906640938816 utils.py:148] skip class_net/class-predict/bias/ExponentialMovingAverage ((279,) vs [810]) -- shape mismatch I0625 17:16:57.201640 139906640938816 utils.py:148] skip class_net/class-predict/pointwise_kernel/ExponentialMovingAverage ((1, 1, 160, 279) vs [1, 1, 160, 810]) -- shape mismatch INFO:tensorflow:Done calling model_fn. I0625 17:17:04.020421 139906640938816 estimator.py:1164] Done calling model_fn. 
INFO:tensorflow:Create CheckpointSaverHook. I0625 17:17:04.021797 139906640938816 basic_session_run_hooks.py:546] Create CheckpointSaverHook. INFO:tensorflow:Graph was finalized. I0625 17:17:20.191052 139906640938816 monitored_session.py:246] Graph was finalized. 2021-06-25 17:17:20.192039: I tensorflow/compiler/jit/xla_gpu_device.cc:99] Not creating XLA devices, tf_xla_enable_xla_devices not set 2021-06-25 17:17:20.192304: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:17:20.193380: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: pciBusID: 0000:00:1e.0 name: Tesla V100-SXM2-16GB computeCapability: 7.0 coreClock: 1.53GHz coreCount: 80 deviceMemorySize: 15.78GiB deviceMemoryBandwidth: 836.37GiB/s 2021-06-25 17:17:20.193443: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0 2021-06-25 17:17:20.193494: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.11 2021-06-25 17:17:20.193520: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublasLt.so.11 2021-06-25 17:17:20.193552: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcufft.so.10 2021-06-25 17:17:20.193594: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcurand.so.10 2021-06-25 17:17:20.193646: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusolver.so.10 2021-06-25 17:17:20.193684: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcusparse.so.11 2021-06-25 17:17:20.193735: I 
tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudnn.so.8 2021-06-25 17:17:20.193848: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:17:20.194849: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:17:20.195807: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1862] Adding visible gpu devices: 0 2021-06-25 17:17:20.195871: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0 2021-06-25 17:17:20.870767: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1261] Device interconnect StreamExecutor with strength 1 edge matrix: 2021-06-25 17:17:20.870836: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1267] 0 2021-06-25 17:17:20.870849: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1280] 0: N 2021-06-25 17:17:20.871154: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:17:20.872225: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:17:20.873212: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero 2021-06-25 17:17:20.874142: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:39] Overriding allow_growth setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. 
Original config value was 0. 2021-06-25 17:17:20.874199: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1406] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 14760 MB memory) -> physical GPU (device: 0, name: Tesla V100-SXM2-16GB, pci bus id: 0000:00:1e.0, compute capability: 7.0) INFO:tensorflow:Restoring parameters from model_dir/efficientdet-d3-finetune-01/model.ckpt-153100 I0625 17:17:20.875761 139906640938816 saver.py:1292] Restoring parameters from model_dir/efficientdet-d3-finetune-01/model.ckpt-153100 2021-06-25 17:17:22.445949: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:196] None of the MLIR optimization passes are enabled (registered 0 passes) 2021-06-25 17:17:25.647201: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2300005000 Hz 2021-06-25 17:17:31.018295: W tensorflow/core/framework/op_kernel.cc:1763] OP_REQUIRES failed at save_restore_v2_ops.cc:205 : Not found: Key efficientnet-b3/blocks_0/conv2d/kernel/Momentum not found in checkpoint Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1375, in _do_call return fn(*args) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1360, in _run_fn target_list, run_metadata) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1453, in _call_tf_sessionrun run_metadata) tensorflow.python.framework.errors_impl.NotFoundError: 2 root error(s) found. (0) Not found: Key efficientnet-b3/blocks_0/conv2d/kernel/Momentum not found in checkpoint [[{{node save/RestoreV2}}]] (1) Not found: Key efficientnet-b3/blocks_0/conv2d/kernel/Momentum not found in checkpoint [[{{node save/RestoreV2}}]] [[save/RestoreV2/_967]] 0 successful operations. 0 derived errors ignored.

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 1298, in restore {self.saver_def.filename_tensor_name: save_path}) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 968, in run run_metadata_ptr) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1191, in _run feed_dict_tensor, options, run_metadata) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1369, in _do_run run_metadata) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/client/session.py", line 1394, in _do_call raise type(e)(node_def, op, message)

Original stack trace for 'save/RestoreV2': File "main.py", line 402, in app.run(main) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/absl/app.py", line 312, in run _run_main(main, args) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/absl/app.py", line 258, in _run_main sys.exit(main(argv)) File "main.py", line 333, in main train_est.train(input_fn=train_input_fn, max_steps=train_steps) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 349, in train loss = self._train_model(input_fn, hooks, saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1175, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1208, in _train_model_default saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1510, in _train_with_estimator_spec save_graph_def=self._config.checkpoint_save_graph_def) as mon_sess: File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 604, in MonitoredTrainingSession stop_grace_period_secs=stop_grace_period_secs) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1038, in init stop_grace_period_secs=stop_grace_period_secs) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 749, in init self._sess = _RecoverableSession(self._coordinated_creator) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1231, in init 
_WrappedSession.init(self, self._create_session()) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1236, in _create_session return self._sess_creator.create_session() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 902, in create_session self.tf_sess = self._session_creator.create_session() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 660, in create_session self._scaffold.finalize() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 243, in finalize self._saver.build() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 847, in build self._build(self._filename, build_save=True, build_restore=True) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 885, in _build build_restore=build_restore) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 509, in _build_internal restore_sequentially, reshape) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 388, in _AddShardedRestoreOps name="restore_shard")) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 335, in _AddRestoreOps restore_sequentially) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 582, in bulk_restore return io_ops.restore_v2(filename_tensor, names, slices, dtypes) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1511, in restore_v2 name=name) File 
"/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 750, in _apply_op_helper attrs=attr_protos, op_def=op_def) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 3536, in _create_op_internal op_def=op_def) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1990, in init self._traceback = tf_stack.extract_stack()

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/py_checkpoint_reader.py", line 70, in get_tensor self, compat.as_bytes(tensor_str)) RuntimeError: Key _CHECKPOINTABLE_OBJECT_GRAPH not found in checkpoint

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 1308, in restore names_to_keys = object_graph_key_mapping(save_path) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 1626, in object_graph_key_mapping object_graph_string = reader.get_tensor(trackable.OBJECT_GRAPH_PROTO_KEY) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/py_checkpoint_reader.py", line 74, in get_tensor error_translator(e) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/py_checkpoint_reader.py", line 35, in error_translator raise errors_impl.NotFoundError(None, None, error_message) tensorflow.python.framework.errors_impl.NotFoundError: Key _CHECKPOINTABLE_OBJECT_GRAPH not found in checkpoint

During handling of the above exception, another exception occurred:

Traceback (most recent call last): File "main.py", line 402, in app.run(main) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/absl/app.py", line 312, in run _run_main(main, args) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/absl/app.py", line 258, in _run_main sys.exit(main(argv)) File "main.py", line 333, in main train_est.train(input_fn=train_input_fn, max_steps=train_steps) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 349, in train loss = self._train_model(input_fn, hooks, saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1175, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1208, in _train_model_default saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1510, in _train_with_estimator_spec save_graph_def=self._config.checkpoint_save_graph_def) as mon_sess: File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 604, in MonitoredTrainingSession stop_grace_period_secs=stop_grace_period_secs) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1038, in init stop_grace_period_secs=stop_grace_period_secs) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 749, in init self._sess = _RecoverableSession(self._coordinated_creator) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1231, in init _WrappedSession.init(self, 
self._create_session()) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1236, in _create_session return self._sess_creator.create_session() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 902, in create_session self.tf_sess = self._session_creator.create_session() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 669, in create_session init_fn=self._scaffold.init_fn) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/session_manager.py", line 295, in prepare_session config=config) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/session_manager.py", line 225, in _restore_checkpoint saver.restore(sess, ckpt.model_checkpoint_path) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 1314, in restore err, "a Variable name or other graph key that is missing")

(0) Not found: Key efficientnet-b3/blocks_0/conv2d/kernel/Momentum not found in checkpoint [[node save/RestoreV2 (defined at /home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py:1510) ]] (1) Not found: Key efficientnet-b3/blocks_0/conv2d/kernel/Momentum not found in checkpoint [[node save/RestoreV2 (defined at /home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py:1510) ]] [[save/RestoreV2/_967]] 0 successful operations. 0 derived errors ignored.

Original stack trace for 'save/RestoreV2': File "main.py", line 402, in app.run(main) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/absl/app.py", line 312, in run _run_main(main, args) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/absl/app.py", line 258, in _run_main sys.exit(main(argv)) File "main.py", line 333, in main train_est.train(input_fn=train_input_fn, max_steps=train_steps) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 349, in train loss = self._train_model(input_fn, hooks, saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1175, in _train_model return self._train_model_default(input_fn, hooks, saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1208, in _train_model_default saving_listeners) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1510, in _train_with_estimator_spec save_graph_def=self._config.checkpoint_save_graph_def) as mon_sess: File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 604, in MonitoredTrainingSession stop_grace_period_secs=stop_grace_period_secs) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1038, in init stop_grace_period_secs=stop_grace_period_secs) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 749, in init self._sess = _RecoverableSession(self._coordinated_creator) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1231, in init 
_WrappedSession.init(self, self._create_session()) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 1236, in _create_session return self._sess_creator.create_session() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 902, in create_session self.tf_sess = self._session_creator.create_session() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 660, in create_session self._scaffold.finalize() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/monitored_session.py", line 243, in finalize self._saver.build() File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 847, in build self._build(self._filename, build_save=True, build_restore=True) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 885, in _build build_restore=build_restore) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 509, in _build_internal restore_sequentially, reshape) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 388, in _AddShardedRestoreOps name="restore_shard")) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 335, in _AddRestoreOps restore_sequentially) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/training/saver.py", line 582, in bulk_restore return io_ops.restore_v2(filename_tensor, names, slices, dtypes) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1511, in restore_v2 name=name) File 
"/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 750, in _apply_op_helper attrs=attr_protos, op_def=op_def) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 3536, in _create_op_internal op_def=op_def) File "/home/ubuntu/anaconda3/envs/effdet/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1990, in init self._traceback = tf_stack.extract_stack() '''

My understanding is that because I froze the backbone during the first 50 epochs, it was not saved in the checkpoint file. I would now like to continue fine-tuning without freezing any part of the network, but the checkpoint file no longer works. Is there a solution to this problem?

Any suggestions would be greatly appreciated.

Thanks

ChulanZhang avatar Jun 25 '21 17:06 ChulanZhang