
[Bug] The dataloader hits a segmentation fault in a CPU-only environment

Open HuDi2018 opened this issue 3 years ago • 1 comment

Using the English Bug report template is recommended, so that your issue can help more people.

Describe the bug

As the title says: when iterating the dataloader on a CPU-only machine, the process crashes with a segmentation fault (full traceback under Additional context).

To Reproduce

Package versions: torch==1.10.0+cpu torchvision==0.11.1 numpy==1.19.5

python3 dataloader_problem.py example.py

Related Information

  1. Output of pip list | grep "mmcv\|mmcls\|^torch" (the full pip list is pasted here):

Package             Version      Editable project location
------------------- ------------ -------------------------
addict              2.4.0
autopep8            1.6.0
certifi             2021.10.8
charset-normalizer  2.0.11
click               7.1.2
colorama            0.4.4
cycler              0.11.0
dataclasses         0.8
flake8              4.0.1
idna                3.3
importlib-metadata  4.2.0
kiwisolver          1.3.1
Markdown            3.3.6
matplotlib          3.3.4
mccabe              0.6.1
mmcls               0.19.0
model-index         0.1.11
numpy               1.19.5
opencv-python       4.5.5.62
openmim             0.1.5
ordered-set         4.0.2
packaging           21.3
pandas              1.1.5
Pillow              8.4.0
pip                 21.3.1
pkg_resources       0.0.0
poptorch            2.5.0+47014
pycodestyle         2.8.0
pyflakes            2.4.0
pyparsing           3.0.7
python-dateutil     2.8.2
pytz                2021.3
PyYAML              6.0
requests            2.27.1
setuptools          59.6.0
six                 1.16.0
tabulate            0.8.9
toml                0.10.2
torch               1.10.0+cpu
torchvision         0.11.1
tqdm                4.62.3
typing_extensions   4.0.1
urllib3             1.26.8
wheel               0.37.1
yapf                0.32.0
zipp                3.6.0

  2. If you modified the config or used a new configuration file, please specify it here:

_base_ = [
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py'
]

paramwise_cfg = dict(
    norm_decay_mult=0.0,
    bias_decay_mult=0.0,
    custom_keys={
        '.cls_token': dict(decay_mult=0.0),
        '.pos_embed': dict(decay_mult=0.0)
    })
optimizer = dict(paramwise_cfg=paramwise_cfg)

# data settings
data = dict(samples_per_gpu=1, workers_per_gpu=1)
  3. If you encountered the problem during training, please post the complete training log and error message: [fill in here]
  4. If you made other related modifications to the code under the mmcls folder, please describe them here.

Reproduction code (dataloader_problem.py):

# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
import time
import warnings

import tqdm
import torch
from mmcv import Config, DictAction
from mmcls.apis import init_random_seed
from mmcv.runner import get_dist_info, init_dist

from mmcls import __version__
from mmcls.datasets import build_dataset
from mmcls.datasets import build_dataloader

import random
import numpy as np
random.seed(888)
np.random.seed(888)
torch.manual_seed(888)

class IterLoader:

    def __init__(self, dataloader):
        self._dataloader = dataloader
        self.iter_loader = iter(self._dataloader)
        self._epoch = 0

    @property
    def epoch(self):
        return self._epoch

    def __next__(self):
        try:
            data = next(self.iter_loader)
        except StopIteration:
            self._epoch += 1
            if hasattr(self._dataloader.sampler, 'set_epoch'):
                self._dataloader.sampler.set_epoch(self._epoch)
            time.sleep(2)  # Prevent possible deadlock during epoch transition
            self.iter_loader = iter(self._dataloader)
            data = next(self.iter_loader)

        return data

    def __len__(self):
        return len(self._dataloader)

def parse_args():
    parser = argparse.ArgumentParser(description='Train a model')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    parser.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')
    group_gpus = parser.add_mutually_exclusive_group()
    group_gpus.add_argument('--device', help='device used for training')
    group_gpus.add_argument(
        '--gpus',
        type=int,
        help='number of gpus to use '
        '(only applicable to non-distributed training)')
    group_gpus.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use '
        '(only applicable to non-distributed training)')
    parser.add_argument('--seed', type=int, default=None, help='random seed')
    parser.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')
    parser.add_argument(
        '--options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecate), '
        'change to --cfg-options instead.')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    if args.options and args.cfg_options:
        raise ValueError(
            '--options and --cfg-options cannot be both '
            'specified, --options is deprecated in favor of --cfg-options')
    if args.options:
        warnings.warn('--options is deprecated in favor of --cfg-options')
        args.cfg_options = args.options

    return args


def main():
    args = parse_args()

    cfg = Config.fromfile(args.config)
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)
    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        cfg.work_dir = osp.join('./work_dirs',
                                osp.splitext(osp.basename(args.config))[0])
    if args.resume_from is not None:
        cfg.resume_from = args.resume_from
    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    else:
        cfg.gpu_ids = range(1) if args.gpus is None else range(args.gpus)

    # init distributed env first, since logger depends on the dist info.
    if args.launcher == 'none':
        distributed = False
    else:
        distributed = True
        init_dist(args.launcher, **cfg.dist_params)
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    datasets = build_dataset(cfg.data.train)

    sampler_cfg = cfg.data.get('sampler', None)

    seed = init_random_seed(args.seed)
    cfg.seed = seed
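
    # Note: workers_per_gpu is passed as 0 below, so PyTorch falls back to
    # single-process data loading and the transform pipeline runs in the main
    # process (consistent with the traceback, where the fault occurs in the
    # current thread rather than in a worker process).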

    data_loader = build_dataloader(
                        datasets,
                        1,
                        workers_per_gpu=0,
                        num_gpus=1,
                        dist=distributed,
                        round_up=True,
                        seed=cfg.seed,
                        persistent_workers=False,
                        sampler_cfg=sampler_cfg)
    
    iter_loader = IterLoader(data_loader)
    # data_batch = next(iter_loader)
    # data_loader_iter = iter(data_loader)
    for _ in tqdm.tqdm(range(len(iter_loader))):
        data = next(iter_loader)

if __name__ == '__main__':
    main()
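
For reference, a rough isolation sketch (not part of the original script; the file name is hypothetical and it assumes the same example.py config) that iterates the dataset directly without any DataLoader, to check whether the segfault comes from the transform pipeline itself rather than from the loader:

# dataset_only.py -- hypothetical helper, iterates samples without a DataLoader
import tqdm
from mmcv import Config
from mmcls.datasets import build_dataset

cfg = Config.fromfile('example.py')  # same config passed to dataloader_problem.py
dataset = build_dataset(cfg.data.train)

# __getitem__ runs the full train pipeline (including AutoAugment / AutoContrast)
for idx in tqdm.tqdm(range(len(dataset))):
    _ = dataset[idx]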

Additional context

  0%|                                                                                                         | 6/1281167 [00:00<11:48:39, 30.13it/s]Fatal Python error: Segmentation fault

Thread 0x00007f39cc2ea700 (most recent call first):
  File "/usr/lib/python3.6/threading.py", line 299 in wait
  File "/usr/lib/python3.6/threading.py", line 551 in wait
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/tqdm/_monitor.py", line 60 in run
  File "/usr/lib/python3.6/threading.py", line 916 in _bootstrap_inner
  File "/usr/lib/python3.6/threading.py", line 884 in _bootstrap

Current thread 0x00007f3af8f4f740 (most recent call first):
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/numpy/lib/histograms.py", line 851 in histogram
  File "<__array_function__ internals>", line 6 in histogram
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/mmcv/image/photometric.py", line 260 in _auto_contrast_channel
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/mmcv/image/photometric.py", line 287 in auto_contrast
  File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/auto_augment.py", line 507 in __call__
  File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/compose.py", line 33 in __call__
  File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/auto_augment.py", line 224 in __call__
  File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/compose.py", line 33 in __call__
  File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/base_dataset.py", line 82 in prepare_data
  File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/base_dataset.py", line 88 in __getitem__
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 49 in <listcomp>
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 49 in fetch
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 561 in _next_data
  File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 521 in __next__
  File "dataloader_problem.py", line 38 in __next__
  File "dataloader_problem.py", line 174 in main
  File "dataloader_problem.py", line 177 in <module>
Segmentation fault
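
Judging from the traceback, the crash happens inside numpy.histogram while mmcv's auto_contrast transform runs in the AutoAugment pipeline. A minimal, hypothetical sketch to check whether the same call crashes outside the dataset pipeline (the script name, image shape, and iteration count are arbitrary):

# isolate_auto_contrast.py -- hypothetical isolation script, not part of the repo
import numpy as np
from mmcv.image import auto_contrast

rng = np.random.default_rng(888)

for i in range(100000):
    # Random HWC uint8 image, similar to what the pipeline feeds to auto_contrast
    img = rng.integers(0, 256, size=(224, 224, 3), dtype=np.uint8)
    auto_contrast(img)
    if i % 10000 == 0:
        print(i, 'images processed without crashing')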

HuDi2018 avatar Feb 18 '22 09:02 HuDi2018

@ZwwWayne Any updates on this issue?

mzr1996 avatar Mar 07 '22 03:03 mzr1996

This issue will be closed as it is inactive; feel free to re-open it if necessary.

tonysy avatar Dec 12 '22 15:12 tonysy