
GPU memory overflow (OOM) during model evaluation

Open · qiuming-93 opened this issue 3 years ago • 4 comments

Please provide the following information to quickly locate the problem:

  • System Environment: CentOS
  • Version: Paddle 2.1.2, PaddleOCR 2.4.0
  • Command: python myself_eval.py -c /share/configs/det_config.yml

Referring to the eval.py script in PaddleOCR, I implemented a script myself_eval.py that evaluates a network I defined myself. The code is as follows:

```python
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import platform
import sys

__dir__ = os.path.dirname(os.path.abspath(__file__))

import time

import paddle

sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))

from ppocr.data import build_dataloader
from ppocr.postprocess import build_post_process
from ppocr.metrics import build_metric
import tools.program as program
from tqdm import tqdm


def main():
    global_config = config['Global']
    # build dataloader
    valid_dataloader = build_dataloader(config, 'Eval', device, logger)

    # build post process
    post_process_class = build_post_process(config['PostProcess'],
                                            global_config)

    # build model
    # for rec algorithm
    if hasattr(post_process_class, 'character'):
        char_num = len(getattr(post_process_class, 'character'))
        if config['Architecture']["algorithm"] in ["Distillation"]:  # distillation model
            for key in config['Architecture']["Models"]:
                config['Architecture']["Models"][key]["Head"][
                    'out_channels'] = char_num
        else:  # base rec model
            config['Architecture']["Head"]['out_channels'] = char_num

    extra_input = config['Architecture'][
        'algorithm'] in ["SRN", "NRTR", "SAR", "SEED"]
    if "model_type" in config['Architecture'].keys():
        model_type = config['Architecture']['model_type']
    else:
        model_type = None
    network_type = config['Global'].get('network_type', 'paddle')

    import network
    model = network.NetWork()
    params = paddle.load('network.pdparams')
    state_dict = model.state_dict()
    for k1 in params.keys():
        if k1 not in state_dict.keys():
            print("The pretrained params {} not in model".format(k1))
        else:
            state_dict[k1] = params[k1]
    model.set_state_dict(state_dict)

    # build metric
    eval_class = build_metric(config['Metric'])
    # start eval
    metric = paddle_eval(model, valid_dataloader, post_process_class,
                         eval_class, model_type, extra_input)
    logger.info('metric eval ***************')
    for k, v in metric.items():
        logger.info('{}:{}'.format(k, v))


def paddle_eval(model, valid_dataloader, post_process_class, eval_class,
                model_type=None, extra_input=False):
    model.eval()
    with paddle.no_grad():
        total_frame = 0.0
        total_time = 0.0
        pbar = tqdm(total=len(valid_dataloader), desc='eval model:',
                    position=0, leave=True)
        max_iter = len(valid_dataloader) - 1 if platform.system(
        ) == "Windows" else len(valid_dataloader)
        for idx, batch in enumerate(valid_dataloader):
            if idx >= max_iter:
                break
            images = batch[0]
            start = time.time()
            if model_type == 'table' or extra_input:
                preds = model(images, data=batch[1:])
            elif model_type == "kie":
                preds = model(batch)
            else:
                preds = model(images)
            # adapt this line to the raw output of the custom network
            preds = {'maps': preds}
            batch = [item.numpy() for item in batch]
            # Obtain usable results from post-processing methods
            total_time += time.time() - start
            # Evaluate the results of the current batch
            if model_type in ['table', 'kie']:
                eval_class(preds, batch)
            else:
                post_result = post_process_class(preds, batch[1])
                eval_class(post_result, batch)

            pbar.update(1)
            total_frame += len(images)
        # Get final metric, e.g. acc or hmean
        metric = eval_class.get_metric()

    pbar.close()
    model.train()
    metric['fps'] = total_frame / total_time
    return metric


if __name__ == '__main__':
    config, device, logger, vdl_writer = program.preprocess()
    main()
```
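To pinpoint where the growth happens, the loop in `paddle_eval` could be instrumented with a small helper like the sketch below (my addition, not part of the original script; `paddle.device.cuda.memory_allocated` and `max_memory_allocated` require Paddle >= 2.3, newer than the 2.1.2 reported above):

```python
import paddle

def log_gpu_memory(idx, images):
    # Print the current and peak allocation together with the input shape,
    # so the iteration that triggers the jump can be matched to its image size.
    cur_mb = paddle.device.cuda.memory_allocated() / 1024 ** 2
    peak_mb = paddle.device.cuda.max_memory_allocated() / 1024 ** 2
    print('iter {}: input {}, allocated {:.0f} MB, peak {:.0f} MB'.format(
        idx, images.shape, cur_mb, peak_mb))
```

Calling `log_gpu_memory(idx, images)` right after `preds = model(images)` would show whether the reported jump coincides with an unusually large input.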

I also added the custom network script network.py in the same directory. The code is as follows (the conv/bn constructors are factored into two helpers, since all layers share the same settings apart from channels, geometry, and the index in the parameter name):

```python
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from paddle import ParamAttr


def _conv(idx, in_ch, out_ch, kernel, stride, pad):
    # All conv layers share these defaults; only channels, kernel size,
    # stride, padding, and the parameter-name index differ.
    return nn.Conv2D(
        in_channels=in_ch, out_channels=out_ch, kernel_size=[kernel, kernel],
        stride=[stride, stride], padding=[pad, pad], dilation=[1, 1], groups=1,
        data_format='NCHW',
        weight_attr=ParamAttr(name='generate_conv2d_%d.w_0' % idx))


def _bn(idx, ch):
    # All BatchNorm layers differ only in channel count and name index.
    return nn.BatchNorm(
        num_channels=ch, is_test=True, momentum=0.8999999761581421,
        epsilon=9.999999747378752e-06, data_layout='NCHW',
        param_attr=ParamAttr(name='generate_batch_norm_%d.w_0' % idx),
        bias_attr=ParamAttr(name='generate_batch_norm_%d.b_0' % idx),
        moving_mean_name='generate_batch_norm_%d.w_1' % idx,
        moving_variance_name='generate_batch_norm_%d.w_2' % idx)


class NetWork(nn.Layer):
    def __init__(self):
        super(NetWork, self).__init__()
        # stem
        self.conv2d_33 = _conv(0, 3, 32, 3, 2, 1)
        self.batch_norm_0 = _bn(0, 32)
        self.conv2d_34 = _conv(1, 32, 32, 3, 1, 1)
        self.batch_norm_1 = _bn(1, 32)
        self.conv2d_35 = _conv(2, 32, 64, 3, 1, 1)
        self.batch_norm_2 = _bn(2, 64)
        self.pool2d_0 = nn.MaxPool2D(kernel_size=[3, 3], stride=[2, 2],
                                     padding=[1, 1], ceil_mode=False,
                                     data_format='NCHW')
        # residual stages
        self.conv2d_36 = _conv(3, 64, 64, 3, 1, 1)
        self.batch_norm_3 = _bn(3, 64)
        self.conv2d_37 = _conv(4, 64, 64, 3, 1, 1)
        self.batch_norm_4 = _bn(4, 64)
        self.conv2d_38 = _conv(5, 64, 64, 1, 1, 0)
        self.batch_norm_5 = _bn(5, 64)
        self.conv2d_39 = _conv(6, 64, 64, 3, 1, 1)
        self.batch_norm_6 = _bn(6, 64)
        self.conv2d_40 = _conv(7, 64, 64, 3, 1, 1)
        self.batch_norm_7 = _bn(7, 64)
        self.conv2d_41 = _conv(8, 64, 128, 3, 2, 1)
        self.batch_norm_8 = _bn(8, 128)
        self.conv2d_42 = _conv(9, 128, 128, 3, 1, 1)
        self.batch_norm_9 = _bn(9, 128)
        self.pool2d_1 = nn.AvgPool2D(kernel_size=[2, 2], stride=[2, 2],
                                     padding=[0, 0], ceil_mode=True,
                                     data_format='NCHW')
        self.conv2d_43 = _conv(10, 64, 128, 1, 1, 0)
        self.batch_norm_10 = _bn(10, 128)
        self.conv2d_44 = _conv(11, 128, 128, 3, 1, 1)
        self.batch_norm_11 = _bn(11, 128)
        self.conv2d_45 = _conv(12, 128, 128, 3, 1, 1)
        self.batch_norm_12 = _bn(12, 128)
        self.conv2d_46 = _conv(13, 128, 256, 3, 2, 1)
        self.batch_norm_13 = _bn(13, 256)
        self.conv2d_47 = _conv(14, 256, 256, 3, 1, 1)
        self.batch_norm_14 = _bn(14, 256)
        self.pool2d_2 = nn.AvgPool2D(kernel_size=[2, 2], stride=[2, 2],
                                     padding=[0, 0], ceil_mode=True,
                                     data_format='NCHW')
        self.conv2d_48 = _conv(15, 128, 256, 1, 1, 0)
        self.batch_norm_15 = _bn(15, 256)
        self.conv2d_49 = _conv(16, 256, 256, 3, 1, 1)
        self.batch_norm_16 = _bn(16, 256)
        self.conv2d_50 = _conv(17, 256, 256, 3, 1, 1)
        self.batch_norm_17 = _bn(17, 256)
        self.conv2d_51 = _conv(18, 256, 512, 3, 2, 1)
        self.batch_norm_18 = _bn(18, 512)
        self.conv2d_52 = _conv(19, 512, 512, 3, 1, 1)
        self.batch_norm_19 = _bn(19, 512)
        self.pool2d_3 = nn.AvgPool2D(kernel_size=[2, 2], stride=[2, 2],
                                     padding=[0, 0], ceil_mode=True,
                                     data_format='NCHW')
        self.conv2d_53 = _conv(20, 256, 512, 1, 1, 0)
        self.batch_norm_20 = _bn(20, 512)
        self.conv2d_54 = _conv(21, 512, 512, 3, 1, 1)
        self.batch_norm_21 = _bn(21, 512)
        self.conv2d_55 = _conv(22, 512, 512, 3, 1, 1)
        self.batch_norm_22 = _bn(22, 512)
        # FPN lateral (1x1) and output (3x3) convs
        self.conv2d_56 = _conv(26, 512, 256, 1, 1, 0)
        self.conv2d_57 = _conv(25, 256, 256, 1, 1, 0)
        self.conv2d_58 = _conv(24, 128, 256, 1, 1, 0)
        self.conv2d_59 = _conv(23, 64, 256, 1, 1, 0)
        self.conv2d_60 = _conv(27, 256, 64, 3, 1, 1)
        self.conv2d_61 = _conv(28, 256, 64, 3, 1, 1)
        self.conv2d_62 = _conv(29, 256, 64, 3, 1, 1)
        self.conv2d_63 = _conv(30, 256, 64, 3, 1, 1)
        # head
        self.conv2d_64 = _conv(31, 256, 64, 3, 1, 1)
        self.batch_norm_23 = _bn(23, 64)
        self.conv2d_transpose_4 = nn.Conv2DTranspose(
            in_channels=64, out_channels=64, kernel_size=[2, 2],
            stride=[2, 2], padding=[0, 0], dilation=[1, 1], groups=1,
            data_format='NCHW',
            weight_attr=ParamAttr(name='generate_conv2d_transpose_0.w_0'),
            bias_attr=ParamAttr(name='generate_conv2d_transpose_0.b_0'))
        self.batch_norm_24 = _bn(24, 64)
        self.conv2d_transpose_5 = nn.Conv2DTranspose(
            in_channels=64, out_channels=1, kernel_size=[2, 2],
            stride=[2, 2], padding=[0, 0], dilation=[1, 1], groups=1,
            data_format='NCHW',
            weight_attr=ParamAttr(name='generate_conv2d_transpose_1.w_0'),
            bias_attr=ParamAttr(name='generate_conv2d_transpose_1.b_0'))

    def forward(self, x):
        conv2d_33_tmp_0 = self.conv2d_33(x)
        batch_norm_0_tmp_3 = self.batch_norm_0(conv2d_33_tmp_0)
        batch_norm_0_tmp_4 = F.relu(batch_norm_0_tmp_3)
        conv2d_34_tmp_0 = self.conv2d_34(batch_norm_0_tmp_4)
        batch_norm_1_tmp_3 = self.batch_norm_1(conv2d_34_tmp_0)
        batch_norm_1_tmp_4 = F.relu(batch_norm_1_tmp_3)
        conv2d_35_tmp_0 = self.conv2d_35(batch_norm_1_tmp_4)
        batch_norm_2_tmp_3 = self.batch_norm_2(conv2d_35_tmp_0)
        batch_norm_2_tmp_4 = F.relu(batch_norm_2_tmp_3)
        pool2d_0_tmp_0 = self.pool2d_0(batch_norm_2_tmp_4)
        conv2d_36_tmp_0 = self.conv2d_36(pool2d_0_tmp_0)
        batch_norm_3_tmp_3 = self.batch_norm_3(conv2d_36_tmp_0)
        batch_norm_3_tmp_4 = F.relu(batch_norm_3_tmp_3)
        conv2d_37_tmp_0 = self.conv2d_37(batch_norm_3_tmp_4)
        batch_norm_4_tmp_3 = self.batch_norm_4(conv2d_37_tmp_0)
        conv2d_38_tmp_0 = self.conv2d_38(pool2d_0_tmp_0)
        batch_norm_5_tmp_3 = self.batch_norm_5(conv2d_38_tmp_0)
        elementwise_add_0 = paddle.add(batch_norm_5_tmp_3, batch_norm_4_tmp_3)
        relu_0_tmp_0 = F.relu(elementwise_add_0)
        conv2d_39_tmp_0 = self.conv2d_39(relu_0_tmp_0)
        batch_norm_6_tmp_3 = self.batch_norm_6(conv2d_39_tmp_0)
        batch_norm_6_tmp_4 = F.relu(batch_norm_6_tmp_3)
        conv2d_40_tmp_0 = self.conv2d_40(batch_norm_6_tmp_4)
        batch_norm_7_tmp_3 = self.batch_norm_7(conv2d_40_tmp_0)
        elementwise_add_1 = paddle.add(relu_0_tmp_0, batch_norm_7_tmp_3)
        relu_1_tmp_0 = F.relu(elementwise_add_1)
        conv2d_41_tmp_0 = self.conv2d_41(relu_1_tmp_0)
        batch_norm_8_tmp_3 = self.batch_norm_8(conv2d_41_tmp_0)
        batch_norm_8_tmp_4 = F.relu(batch_norm_8_tmp_3)
        conv2d_42_tmp_0 = self.conv2d_42(batch_norm_8_tmp_4)
        batch_norm_9_tmp_3 = self.batch_norm_9(conv2d_42_tmp_0)
        pool2d_1_tmp_0 = self.pool2d_1(relu_1_tmp_0)
        conv2d_43_tmp_0 = self.conv2d_43(pool2d_1_tmp_0)
        batch_norm_10_tmp_3 = self.batch_norm_10(conv2d_43_tmp_0)
        elementwise_add_2 = paddle.add(batch_norm_10_tmp_3, batch_norm_9_tmp_3)
        relu_2_tmp_0 = F.relu(elementwise_add_2)
        conv2d_44_tmp_0 = self.conv2d_44(relu_2_tmp_0)
        batch_norm_11_tmp_3 = self.batch_norm_11(conv2d_44_tmp_0)
        batch_norm_11_tmp_4 = F.relu(batch_norm_11_tmp_3)
        conv2d_45_tmp_0 = self.conv2d_45(batch_norm_11_tmp_4)
        batch_norm_12_tmp_3 = self.batch_norm_12(conv2d_45_tmp_0)
        elementwise_add_3 = paddle.add(relu_2_tmp_0, batch_norm_12_tmp_3)
        relu_3_tmp_0 = F.relu(elementwise_add_3)
        conv2d_46_tmp_0 = self.conv2d_46(relu_3_tmp_0)
        batch_norm_13_tmp_3 = self.batch_norm_13(conv2d_46_tmp_0)
        batch_norm_13_tmp_4 = F.relu(batch_norm_13_tmp_3)
        conv2d_47_tmp_0 = self.conv2d_47(batch_norm_13_tmp_4)
        batch_norm_14_tmp_3 = self.batch_norm_14(conv2d_47_tmp_0)
        pool2d_2_tmp_0 = self.pool2d_2(relu_3_tmp_0)
        conv2d_48_tmp_0 = self.conv2d_48(pool2d_2_tmp_0)
        batch_norm_15_tmp_3 = self.batch_norm_15(conv2d_48_tmp_0)
        elementwise_add_4 = paddle.add(batch_norm_15_tmp_3, batch_norm_14_tmp_3)
        relu_4_tmp_0 = F.relu(elementwise_add_4)
        conv2d_49_tmp_0 = self.conv2d_49(relu_4_tmp_0)
        batch_norm_16_tmp_3 = self.batch_norm_16(conv2d_49_tmp_0)
        batch_norm_16_tmp_4 = F.relu(batch_norm_16_tmp_3)
        conv2d_50_tmp_0 = self.conv2d_50(batch_norm_16_tmp_4)
        batch_norm_17_tmp_3 = self.batch_norm_17(conv2d_50_tmp_0)
        elementwise_add_5 = paddle.add(relu_4_tmp_0, batch_norm_17_tmp_3)
        relu_5_tmp_0 = F.relu(elementwise_add_5)
        conv2d_51_tmp_0 = self.conv2d_51(relu_5_tmp_0)
        batch_norm_18_tmp_3 = self.batch_norm_18(conv2d_51_tmp_0)
        batch_norm_18_tmp_4 = F.relu(batch_norm_18_tmp_3)
        conv2d_52_tmp_0 = self.conv2d_52(batch_norm_18_tmp_4)
        batch_norm_19_tmp_3 = self.batch_norm_19(conv2d_52_tmp_0)
        pool2d_3_tmp_0 = self.pool2d_3(relu_5_tmp_0)
        conv2d_53_tmp_0 = self.conv2d_53(pool2d_3_tmp_0)
        batch_norm_20_tmp_3 = self.batch_norm_20(conv2d_53_tmp_0)
        elementwise_add_6 = paddle.add(batch_norm_20_tmp_3, batch_norm_19_tmp_3)
        relu_6_tmp_0 = F.relu(elementwise_add_6)
        conv2d_54_tmp_0 = self.conv2d_54(relu_6_tmp_0)
        batch_norm_21_tmp_3 = self.batch_norm_21(conv2d_54_tmp_0)
        batch_norm_21_tmp_4 = F.relu(batch_norm_21_tmp_3)
        conv2d_55_tmp_0 = self.conv2d_55(batch_norm_21_tmp_4)
        batch_norm_22_tmp_3 = self.batch_norm_22(conv2d_55_tmp_0)
        elementwise_add_7 = paddle.add(relu_6_tmp_0, batch_norm_22_tmp_3)
        relu_7_tmp_0 = F.relu(elementwise_add_7)
        conv2d_56_tmp_0 = self.conv2d_56(relu_7_tmp_0)
        conv2d_57_tmp_0 = self.conv2d_57(relu_5_tmp_0)
        conv2d_58_tmp_0 = self.conv2d_58(relu_3_tmp_0)
        conv2d_59_tmp_0 = self.conv2d_59(relu_1_tmp_0)
        nearest_interp_v2_0_tmp_0 = F.upsample(conv2d_56_tmp_0, scale_factor=[2.0, 2.0], mode='nearest', align_mode=1, align_corners=False, data_format='NCHW')
        tmp_0 = paddle.add(conv2d_57_tmp_0, nearest_interp_v2_0_tmp_0)
        nearest_interp_v2_1_tmp_0 = F.upsample(tmp_0, scale_factor=[2.0, 2.0], mode='nearest', align_mode=1, align_corners=False, data_format='NCHW')
        tmp_1 = paddle.add(conv2d_58_tmp_0, nearest_interp_v2_1_tmp_0)
        nearest_interp_v2_2_tmp_0 = F.upsample(tmp_1, scale_factor=[2.0, 2.0], mode='nearest', align_mode=1, align_corners=False, data_format='NCHW')
        tmp_2 = paddle.add(conv2d_59_tmp_0, nearest_interp_v2_2_tmp_0)
        conv2d_60_tmp_0 = self.conv2d_60(conv2d_56_tmp_0)
        conv2d_61_tmp_0 = self.conv2d_61(tmp_0)
        conv2d_62_tmp_0 = self.conv2d_62(tmp_1)
        conv2d_63_tmp_0 = self.conv2d_63(tmp_2)
        nearest_interp_v2_3_tmp_0 = F.upsample(conv2d_60_tmp_0, scale_factor=[8.0, 8.0], mode='nearest', align_mode=1, align_corners=False, data_format='NCHW')
        nearest_interp_v2_4_tmp_0 = F.upsample(conv2d_61_tmp_0, scale_factor=[4.0, 4.0], mode='nearest', align_mode=1, align_corners=False, data_format='NCHW')
        nearest_interp_v2_5_tmp_0 = F.upsample(conv2d_62_tmp_0, scale_factor=[2.0, 2.0], mode='nearest', align_mode=1, align_corners=False, data_format='NCHW')
        concat_0_tmp_0 = paddle.concat([nearest_interp_v2_3_tmp_0, nearest_interp_v2_4_tmp_0, nearest_interp_v2_5_tmp_0, conv2d_63_tmp_0], axis=1)
        conv2d_64_tmp_0 = self.conv2d_64(concat_0_tmp_0)
        batch_norm_23_tmp_3 = self.batch_norm_23(conv2d_64_tmp_0)
        batch_norm_23_tmp_4 = F.relu(batch_norm_23_tmp_3)
        conv2d_transpose_4_tmp_0 = self.conv2d_transpose_4(batch_norm_23_tmp_4)
        elementwise_add_8_tmp_0 = conv2d_transpose_4_tmp_0
        batch_norm_24_tmp_3 = self.batch_norm_24(elementwise_add_8_tmp_0)
        batch_norm_24_tmp_4 = F.relu(batch_norm_24_tmp_3)
        conv2d_transpose_5_tmp_0 = self.conv2d_transpose_5(batch_norm_24_tmp_4)
        elementwise_add_9_tmp_0 = conv2d_transpose_5_tmp_0
        sigmoid_0_tmp_0 = F.sigmoid(elementwise_add_9_tmp_0)
        return sigmoid_0_tmp_0

```

Evaluation runs on a T4 with batch_size set to 1. GPU memory usage is normal when evaluation starts, but after the 102nd image it grows by more than a factor of two, and at the 133rd image the GPU runs out of memory. I have tried many times and it happens the same way every time.

qiuming-93 · Aug 23 '22 03:08

The evaluation-related part of the config file is as follows:

```yaml
Eval:
  dataset:
    name: SimpleDataSet
    data_dir: /share/train_data
    label_file_list:
      - /share/train_data/Label.txt
    ratio_list:
      - 1.0
    transforms:
      - DecodeImage:
          img_mode: BGR
          channel_first: false
      - DetLabelEncode: null
      - DetResizeForTest: null
      - NormalizeImage:
          scale: 1./255.
          mean:
            - 0.485
            - 0.456
            - 0.406
          std:
            - 0.229
            - 0.224
            - 0.225
          order: hwc
      - ToCHWImage: null
      - KeepKeys:
          keep_keys:
            - image
            - shape
            - polys
            - ignore_tags
  loader:
    shuffle: false
    drop_last: false
    batch_size_per_card: 1
    num_workers: 0
```
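One thing worth noting about this config: `DetResizeForTest: null` keeps PaddleOCR's default test-time resize, which does not cap large images, so a few unusually high-resolution samples can cause a sudden jump in activation memory. Assuming the installed PaddleOCR supports the `limit_side_len`/`limit_type` options of DetResizeForTest (they mirror the defaults of tools/infer/predict_det.py), a bounded variant would look like this sketch:

```yaml
      - DetResizeForTest:
          limit_side_len: 960   # cap the longer side at 960 px
          limit_type: max
```

This changes the evaluation inputs, so metrics may shift slightly, but it is a quick way to test whether image size drives the memory spike.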

qiuming-93 · Aug 23 '22 03:08

The code formatting is a bit hard to read. For the GPU memory overflow, you can troubleshoot the following points:

  1. Confirm that no_grad is used.
  2. Check whether Tensors keep being appended to some list container; the tensors held by the container can never be destructed, so GPU memory keeps growing (a minimal illustration follows this list).
  3. Check whether the model is created inside a for loop, which makes GPU memory grow continuously.
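A minimal sketch of point 2 (hypothetical names, not code from this issue): a Python list that holds live GPU tensors pins their device memory for the lifetime of the list, while storing host-side copies does not:

```python
import paddle

preds_kept = []  # anti-pattern: every appended tensor stays on the GPU
preds_safe = []  # ok: only host-side numpy copies are retained

with paddle.no_grad():
    for idx in range(100):
        pred = paddle.rand([1, 1, 640, 640])  # stand-in for model(images)
        preds_kept.append(pred)          # device memory grows each iteration
        preds_safe.append(pred.numpy())  # device copy can be freed right away
```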

tink2123 · Aug 25 '22 11:08

> The code formatting is a bit hard to read. For the GPU memory overflow, you can troubleshoot the following points:
>
> 1. Confirm that no_grad is used.
> 2. Check whether Tensors keep being appended to some list container; the tensors held by the container can never be destructed, so GPU memory keeps growing.
> 3. Check whether the model is created inside a for loop, which makes GPU memory grow continuously.

I have checked all three of these and found no problem. Moreover, memory does not grow on every inference: at the start of evaluation it is normal, then after the 102nd image it grows by more than a factor of two, and at the 133rd image it overflows. I tried many times with the same result.
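Not something suggested in this thread, but one cheap check given these symptoms: since `DetResizeForTest: null` produces a different input shape for almost every image, the allocator caches blocks for many shapes, and periodically releasing the cached-but-unused blocks can keep the footprint bounded. A sketch, assuming a Paddle version that provides `paddle.device.cuda.empty_cache()` (added after the 2.1.2 reported here):

```python
import paddle

def maybe_release_cache(idx, every=50):
    # Hypothetical helper: every `every` iterations, hand cached but currently
    # unused allocator blocks back to the GPU driver. Harmless to correctness.
    if idx % every == 0:
        paddle.device.cuda.empty_cache()
```

Dropping `maybe_release_cache(idx)` into the eval loop would at least distinguish a true leak (memory still grows) from allocator-cache fragmentation (memory stays flat).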

qiuming-93 · Aug 31 '22 02:08

Has this been solved? For me training is also normal, but during evaluation GPU memory always overflows no matter how small the batch is, because the memory keeps growing.

luochenbeisheng · Oct 26 '22 10:10