pytorch-distributed
How can I specify which GPUs to train on?
For example, I am training on an 8-GPU node and want to use 4 of the GPUs. GPU ids 0,1,2,3 work fine, but any other combination of GPU ids does not.
Following https://github.com/PyTorchLightning/pytorch-lightning/issues/2407, I changed the GPU id assigned to each process, but I get:
RuntimeError: cuda runtime error (10) : invalid device ordinal at /pytorch/torch/csrc/cuda/Module.cpp:59
My code:
```python
import argparse
import os

import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.utils.data.distributed

parser = argparse.ArgumentParser(description='multi process')
parser.add_argument('--gpu-id', type=str, default='0,1,2,4')
parser.add_argument('--world-size', default=1, type=int,
                    help='number of nodes for distributed training')
parser.add_argument('--rank', default=0, type=int,
                    help='node rank for distributed training')
parser.add_argument('--dist-url', default='tcp://localhost:23456', type=str,
                    help='url used to set up distributed training')
parser.add_argument('--dist-backend', default='nccl', type=str,
                    help='distributed backend')
args = parser.parse_args()


def main():
    global args
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id
    # args.gpu = list(map(int, args.gpu_id.split(',')))
    # ngpus_per_node = torch.cuda.device_count()  # len(args.gpu)
    ngpus_per_node = len(args.gpu_id.split(','))
    # print('number of visible GPUs:', ngpus_per_node)
    args.nprocs = ngpus_per_node
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))


def main_worker(local_rank, ngpus_per_node, args):
    # map the spawn index back to the physical GPU id, e.g. local_rank 3 -> gpu 4
    gpus = os.environ['CUDA_VISIBLE_DEVICES'].split(',')
    gpu = int(gpus[local_rank])
    args.gpu = gpu
    best_acc = 0
    args.rank = args.rank * ngpus_per_node + local_rank
    print('rank: {} / {}'.format(args.rank, args.world_size))
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    torch.cuda.set_device(gpu)  # this is where "invalid device ordinal" is raised


if __name__ == '__main__':
    main()
```
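
For reference, here is a minimal sketch of the device selection, assuming the error comes from the fact that once `CUDA_VISIBLE_DEVICES='0,1,2,4'` is set, the four visible GPUs are re-indexed as 0..3 inside the process, so passing the physical id 4 to `torch.cuda.set_device()` raises "invalid device ordinal". The names `main_fixed` and `main_worker_fixed` are only illustrative, not from the repo:

```python
# Sketch: use the spawn index (local_rank) directly as the device ordinal,
# because CUDA_VISIBLE_DEVICES already remaps the chosen GPUs to 0..nprocs-1.
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


def main_worker_fixed(local_rank, ngpus_per_node, args):
    # local_rank runs from 0 to ngpus_per_node-1 and already matches the
    # re-indexed device ordinals, so do NOT translate it back to the physical id.
    args.rank = args.rank * ngpus_per_node + local_rank
    dist.init_process_group(backend=args.dist_backend,
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)
    torch.cuda.set_device(local_rank)              # not torch.cuda.set_device(4)
    device = torch.device('cuda', local_rank)      # device handle for the model/data
    print('rank {} runs on physical GPU {}'.format(
        args.rank, os.environ['CUDA_VISIBLE_DEVICES'].split(',')[local_rank]))


def main_fixed(args):
    # Mask the GPUs before any CUDA call; the spawned children inherit the env var.
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu_id   # e.g. '0,1,2,4'
    ngpus_per_node = len(args.gpu_id.split(','))
    args.world_size = ngpus_per_node * args.world_size
    mp.spawn(main_worker_fixed, nprocs=ngpus_per_node,
             args=(ngpus_per_node, args))
```

With this mapping, each spawned process still ends up on the intended physical GPU (0, 1, 2 or 4), because the masking is applied by the driver before PyTorch enumerates the devices.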