mmpretrain
mmpretrain copied to clipboard
[Bug] 在 CPU 环境中,dataloader 会出现段错误
推荐使用英语模板 Bug report,以便你的问题帮助更多人。
描述 bug
如题
复现流程
模块版本: torch==1.10.0+cpu torchvision==0.11.1 numpy==1.19.5
python3 dataloader_problem.py example.py
相关信息
1. `pip list | grep "mmcv\|mmcls\|^torch"` 命令的输出:
Package Version Editable project location
addict 2.4.0
autopep8 1.6.0
certifi 2021.10.8
charset-normalizer 2.0.11
click 7.1.2
colorama 0.4.4
cycler 0.11.0
dataclasses 0.8
flake8 4.0.1
idna 3.3
importlib-metadata 4.2.0
kiwisolver 1.3.1
Markdown 3.3.6
matplotlib 3.3.4
mccabe 0.6.1
mmcls 0.19.0
model-index 0.1.11
numpy 1.19.5
opencv-python 4.5.5.62
openmim 0.1.5
ordered-set 4.0.2
packaging 21.3
pandas 1.1.5
Pillow 8.4.0
pip 21.3.1
pkg_resources 0.0.0
poptorch 2.5.0+47014
pycodestyle 2.8.0
pyflakes 2.4.0
pyparsing 3.0.7
python-dateutil 2.8.2
pytz 2021.3
PyYAML 6.0
requests 2.27.1
setuptools 59.6.0
six 1.16.0
tabulate 0.8.9
toml 0.10.2
torch 1.10.0+cpu
torchvision 0.11.1
tqdm 4.62.3
typing_extensions 4.0.1
urllib3 1.26.8
wheel 0.37.1
yapf 0.32.0
zipp 3.6.0
2. 如果你修改了,或者使用了新的配置文件,请在这里写明
# Base config files this config builds on (dataset, schedule, runtime).
_base_ = [
    '../_base_/datasets/imagenet_bs64_swin_224.py',
    '../_base_/schedules/imagenet_bs1024_adamw_swin.py',
    '../_base_/default_runtime.py',
]

# Per-parameter optimizer overrides: every decay multiplier is set to 0.0,
# both the norm/bias ones and the ones keyed by '.cls_token' / '.pos_embed'.
# NOTE(review): presumably this disables weight decay for those parameters;
# confirm against the optimizer constructor that consumes this dict.
paramwise_cfg = {
    'norm_decay_mult': 0.0,
    'bias_decay_mult': 0.0,
    'custom_keys': {
        '.cls_token': {'decay_mult': 0.0},
        '.pos_embed': {'decay_mult': 0.0},
    },
}
optimizer = {'paramwise_cfg': paramwise_cfg}

# data settings
data = {'samples_per_gpu': 1, 'workers_per_gpu': 1}
- 如果你是在训练过程中遇到的问题,请填写完整的训练日志和报错信息 [填写这里]
- 如果你对 mmcls 文件夹下的代码做了其他相关的修改,请在这里写明
复现代码(dataloader_problem.py):
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
import time
import warnings
import tqdm
import torch
from mmcv import Config, DictAction
from mmcls.apis import init_random_seed
from mmcv.runner import get_dist_info, init_dist
from mmcls import __version__
from mmcls.datasets import build_dataset
from mmcls.datasets import build_dataloader
import random
import numpy as np
# Seed the stdlib, NumPy and PyTorch random number generators with the same
# fixed value as soon as the module is loaded.
# NOTE(review): presumably done to make the reproduction deterministic.
random.seed(888)
np.random.seed(888)
torch.manual_seed(888)
class IterLoader:
    """Iterator over a dataloader that restarts it when it is exhausted.

    Each time the wrapped dataloader runs out of batches the epoch counter
    is incremented, the sampler (if it exposes ``set_epoch``) is told about
    the new epoch, and a fresh iterator over the dataloader is created.

    Args:
        dataloader: Any iterable that also supports ``len()``. If it has a
            ``sampler`` attribute with a ``set_epoch`` method, that method
            is called with the new epoch number on every rollover.
    """

    def __init__(self, dataloader):
        self._dataloader = dataloader
        self.iter_loader = iter(self._dataloader)
        self._epoch = 0

    @property
    def epoch(self):
        """Number of times the wrapped dataloader has been exhausted."""
        return self._epoch

    def __iter__(self):
        # Required by the iterator protocol; without it ``iter(obj)`` and
        # ``for batch in obj`` raise TypeError even though __next__ exists.
        return self

    def __next__(self):
        """Return the next batch, restarting the dataloader when needed.

        Raises:
            StopIteration: Only if the dataloader yields nothing even right
                after being restarted (i.e. it is empty).
        """
        try:
            data = next(self.iter_loader)
        except StopIteration:
            self._epoch += 1
            # Tolerate loaders without a ``sampler`` attribute instead of
            # failing with AttributeError at the epoch boundary.
            sampler = getattr(self._dataloader, 'sampler', None)
            if hasattr(sampler, 'set_epoch'):
                sampler.set_epoch(self._epoch)
            time.sleep(2)  # Prevent possible deadlock during epoch transition
            self.iter_loader = iter(self._dataloader)
            data = next(self.iter_loader)
        return data

    def __len__(self):
        """Return the length reported by the wrapped dataloader."""
        return len(self._dataloader)
def parse_args():
    """Build the command-line interface and parse ``sys.argv``.

    Besides parsing, this exports ``LOCAL_RANK`` into ``os.environ`` (taken
    from ``--local_rank``) when the variable is not already set, and maps
    the deprecated ``--options`` onto ``cfg_options``.

    Returns:
        argparse.Namespace: The parsed arguments.

    Raises:
        ValueError: If both ``--options`` and ``--cfg-options`` are given.
    """
    cli = argparse.ArgumentParser(description='Train a model')
    cli.add_argument('config', help='train config file path')
    cli.add_argument('--work-dir', help='the dir to save logs and models')
    cli.add_argument(
        '--resume-from', help='the checkpoint file to resume from')
    cli.add_argument(
        '--no-validate',
        action='store_true',
        help='whether not to evaluate the checkpoint during training')

    # --device, --gpus and --gpu-ids are mutually exclusive.
    non_dist_note = '(only applicable to non-distributed training)'
    device_opts = cli.add_mutually_exclusive_group()
    device_opts.add_argument('--device', help='device used for training')
    device_opts.add_argument(
        '--gpus', type=int, help='number of gpus to use ' + non_dist_note)
    device_opts.add_argument(
        '--gpu-ids',
        type=int,
        nargs='+',
        help='ids of gpus to use ' + non_dist_note)

    cli.add_argument('--seed', type=int, default=None, help='random seed')
    cli.add_argument(
        '--deterministic',
        action='store_true',
        help='whether to set deterministic options for CUDNN backend.')

    # Both config-override flags share the same parsing behaviour.
    override_kwargs = dict(nargs='+', action=DictAction)
    cli.add_argument(
        '--options',
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file (deprecate), '
        'change to --cfg-options instead.',
        **override_kwargs)
    cli.add_argument(
        '--cfg-options',
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.',
        **override_kwargs)

    cli.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    cli.add_argument('--local_rank', type=int, default=0)

    args = cli.parse_args()

    # Only fill in LOCAL_RANK when the environment has not set it already.
    os.environ.setdefault('LOCAL_RANK', str(args.local_rank))

    if args.options:
        if args.cfg_options:
            raise ValueError(
                '--options and --cfg-options cannot be both '
                'specified, --options is deprecated in favor of --cfg-options')
        warnings.warn('--options is deprecated in favor of --cfg-options')
        args.cfg_options = args.options

    return args
def main():
    """Build the training dataloader from a config and iterate it once.

    Parses the CLI, loads and patches the config, optionally initialises the
    distributed environment, builds the training dataset and a dataloader
    (batch size 1, ``workers_per_gpu=0``), then pulls ``len(loader)`` batches
    through an ``IterLoader`` under a tqdm progress bar. The batches
    themselves are not used.
    """
    args = parse_args()

    cfg = Config.fromfile(args.config)
    overrides = args.cfg_options
    if overrides is not None:
        cfg.merge_from_dict(overrides)

    # set cudnn_benchmark
    if cfg.get('cudnn_benchmark', False):
        torch.backends.cudnn.benchmark = True

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # Fall back to the config file's base name (extension stripped).
        config_stem = osp.splitext(osp.basename(args.config))[0]
        cfg.work_dir = osp.join('./work_dirs', config_stem)

    if args.resume_from is not None:
        cfg.resume_from = args.resume_from

    if args.gpu_ids is not None:
        cfg.gpu_ids = args.gpu_ids
    elif args.gpus is None:
        cfg.gpu_ids = range(1)
    else:
        cfg.gpu_ids = range(args.gpus)

    # init distributed env first, since logger depends on the dist info.
    distributed = args.launcher != 'none'
    if distributed:
        init_dist(args.launcher, **cfg.dist_params)
        _, world_size = get_dist_info()
        cfg.gpu_ids = range(world_size)

    datasets = build_dataset(cfg.data.train)
    sampler_cfg = cfg.data.get('sampler', None)

    cfg.seed = init_random_seed(args.seed)

    data_loader = build_dataloader(
        datasets,
        1,
        workers_per_gpu=0,
        num_gpus=1,
        dist=distributed,
        round_up=True,
        seed=cfg.seed,
        persistent_workers=False,
        sampler_cfg=sampler_cfg)

    iter_loader = IterLoader(data_loader)
    # Pull exactly one pass worth of batches; each batch is discarded.
    progress = tqdm.tqdm(range(len(iter_loader)))
    for _ in progress:
        batch = next(iter_loader)
# Run the reproduction only when this file is executed as a script.
if __name__ == '__main__':
    main()
附加内容
0%| | 6/1281167 [00:00<11:48:39, 30.13it/s]Fatal Python error: Segmentation fault
Thread 0x00007f39cc2ea700 (most recent call first):
File "/usr/lib/python3.6/threading.py", line 299 in wait
File "/usr/lib/python3.6/threading.py", line 551 in wait
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/tqdm/_monitor.py", line 60 in run
File "/usr/lib/python3.6/threading.py", line 916 in _bootstrap_inner
File "/usr/lib/python3.6/threading.py", line 884 in _bootstrap
Current thread 0x00007f3af8f4f740 (most recent call first):
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/numpy/lib/histograms.py", line 851 in histogram
File "<__array_function__ internals>", line 6 in histogram
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/mmcv/image/photometric.py", line 260 in _auto_contrast_channel
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/mmcv/image/photometric.py", line 287 in auto_contrast
File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/auto_augment.py", line 507 in __call__
File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/compose.py", line 33 in __call__
File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/auto_augment.py", line 224 in __call__
File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/pipelines/compose.py", line 33 in __call__
File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/base_dataset.py", line 82 in prepare_data
File "/localdata/cn-customer-engineering/hudi/mmclassification/mmcls/datasets/base_dataset.py", line 88 in __getitem__
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 49 in <listcomp>
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/_utils/fetch.py", line 49 in fetch
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 561 in _next_data
File "/localdata/cn-customer-engineering/hudi/virtual_env/poplar_sdk-ubuntu_18_04-2.5.0-EA.1+891-e60c1f84e8/2.5.0-EA.1+891_poptorch/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 521 in __next__
File "dataloader_problem.py", line 38 in __next__
File "dataloader_problem.py", line 174 in main
File "dataloader_problem.py", line 177 in <module>
Segmentation fault
@ZwwWayne Any updates on this issue?
This issue will be closed as it is inactive, feel free to re-open it if necessary.