[BUG]: assert self.l2_norm is None, "you are calculating the l2 norm twice"
🐛 Describe the bug
import argparse
import copy
import logging
import os
import time

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

try:
    from apex import amp
except ImportError:
    print("the apex library is not available")

from utils import (upper_limit, lower_limit, std, clamp, get_loaders, attack_pgd,
                   evaluate_pgd, evaluate_pgd_8, evaluate_standard)

import colossalai
from colossalai.utils import get_current_device
from colossalai.zero import ColoInitContext, GeminiAdamOptimizer
from colossalai.zero.gemini import get_static_torch_model

import timm

logger = logging.getLogger(__name__)
def get_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--net', default='resnet50', type=str)
    parser.add_argument('--batch-size', default=106, type=int)
    parser.add_argument('--data-dir', default='../../cifar-data', type=str)
    parser.add_argument('--epochs', default=15, type=int)
    parser.add_argument('--lr-schedule', default='multistep', choices=['cyclic', 'multistep'])
    parser.add_argument('--lr-min', default=0., type=float)
    parser.add_argument('--lr-max', default=1e-4, type=float)
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument('--weight-decay', default=1e-3, type=float)
    parser.add_argument('--momentum', default=0.9, type=float)
    parser.add_argument('--epsilon', default=8, type=int)
    parser.add_argument("--placement", type=str, default="cpu",
                        help="Placement Policy for Gemini. Valid when using colossalai as dist plan.")
    parser.add_argument('--alpha', default=10, type=float, help='Step size')
    parser.add_argument('--delta-init', default='previous', choices=['zero', 'random', 'previous'],
                        help='Perturbation initialization method')
    parser.add_argument('--out-dir', default='train_fgsm_output', type=str, help='Output directory')
    parser.add_argument('--seed', default=0, type=int, help='Random seed')
    parser.add_argument('--early-stop', action='store_true', help='Early stop if overfitting occurs')
    parser.add_argument('--opt-level', default='O0', type=str, choices=['O0', 'O1', 'O2'],
                        help='O0 is FP32 training, O1 is Mixed Precision, and O2 is "Almost FP16" Mixed Precision')
    parser.add_argument('--loss-scale', default='1.0', type=str, choices=['1.0', 'dynamic'],
                        help='If loss_scale is "dynamic", adaptively adjust the loss scale over time')
    parser.add_argument('--master-weights', action='store_true',
                        help='Maintain FP32 master weights to accompany any FP16 model weights, not applicable for O1 opt level')
    return parser.parse_args()
# Gemini + ZeRO DDP
def gemini_zero_dpp(model: torch.nn.Module, placememt_policy: str = "cpu"): from colossalai.zero.gemini import GeminiDDP
model = GeminiDDP(model,
device=get_current_device(),
force_outputs_fp32=True,
placement_policy=placememt_policy,
pin_memory=True,
search_range_mb=64)
return model
def main():
    args = get_args()
    # colossalai.launch_from_torch(config={}, seed=args.seed)
    # launch distributed environment
    colossalai.launch(config={},
                      rank=0,            # args.rank
                      world_size=1,      # args.world_size
                      host=0,            # args.host
                      port=2098,         # args.port
                      backend='nccl')    # args.backend
if not os.path.exists(args.out_dir):
os.mkdir(args.out_dir)
logfile = os.path.join(args.out_dir, 'output.log')
if os.path.exists(logfile):
os.remove(logfile)
logging.basicConfig(
format='[%(asctime)s] - %(message)s',
datefmt='%Y/%m/%d %H:%M:%S',
level=logging.DEBUG,
filename=os.path.join(args.out_dir, 'output.log'))
logger.info(args)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
train_loader, test_loader = get_loaders(args.data_dir, args.batch_size)
epsilon = (args.epsilon / 255.) / std
alpha = (args.alpha / 255.) / std
pgd_alpha = (2 / 255.) / std
with ColoInitContext(device=get_current_device()):
model = timm.create_model(args.net, pretrained=False, num_classes=100)
model.train()
model.load_state_dict(torch.load('resnet50-190-regular.pth'))
#opt = torch.optim.SGD(model.parameters(), lr=args.lr_max, momentum=args.momentum, weight_decay=args.weight_decay)
# opt = torch.optim.AdamW(params=filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr_max,
# weight_decay=args.weight_decay)
# config optimizer for colossalai zero
model = gemini_zero_dpp(model, args.placement)
optimizer = GeminiAdamOptimizer(model, lr=args.lr_max, initial_scale=2)
amp_args = dict(opt_level=args.opt_level, loss_scale=args.loss_scale, verbosity=False)
if args.opt_level == 'O2':
amp_args['master_weights'] = args.master_weights
model, opt = amp.initialize(model, opt, **amp_args)
criterion = nn.CrossEntropyLoss()
if args.delta_init == 'previous':
delta = torch.zeros(args.batch_size, 3, 224, 224).cuda()
lr_steps = args.epochs * len(train_loader)
if args.lr_schedule == 'cyclic':
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=args.lr_min, max_lr=args.lr_max,
step_size_up=lr_steps / 2, step_size_down=lr_steps / 2)
elif args.lr_schedule == 'multistep':
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[lr_steps / 2, lr_steps * 3 / 4], gamma=0.1)
# Training
prev_robust_acc = 0.
start_train_time = time.time()
logger.info('start_training')
#logger.info('Epoch \t Seconds \t LR \t \t Train Loss \t Train Acc')
for epoch in range(args.epochs):
model.train()
start_epoch_time = time.time()
train_loss = 0
train_acc = 0
train_n = 0
for i, (X, y) in enumerate(train_loader):
print('##################')
X, y = X.cuda(), y.cuda()
if i == 0:
first_batch = (X, y)
if args.delta_init != 'previous':
delta = torch.zeros_like(X).cuda()
if args.delta_init == 'random':
for j in range(len(epsilon)):
delta[:, j, :, :].uniform_(-epsilon[j][0][0].item(), epsilon[j][0][0].item())
delta.data = clamp(delta, lower_limit - X, upper_limit - X)
delta.requires_grad = True
output = model(X + delta[:X.size(0)])
loss = F.cross_entropy(output, y)
if args.opt_level == 'O2':
with amp.scale_loss(loss, opt) as scaled_loss:
scaled_loss.backward()
else:
optimizer.backward(loss)
grad = delta.grad.detach()
delta.data = clamp(delta + alpha.cuda() * torch.sign(grad), -epsilon.cuda(), epsilon.cuda())
delta.data[:X.size(0)] = clamp(delta[:X.size(0)], lower_limit.cuda() - X, upper_limit.cuda() - X)
delta = delta.detach()
output = model(X + delta[:X.size(0)])
loss = criterion(output, y)
optimizer.zero_grad()
if args.opt_level == 'O2':
with amp.scale_loss(loss, opt) as scaled_loss:
scaled_loss.backward()
else:
optimizer.backward(loss)
optimizer.step()
train_loss += loss.item() * y.size(0)
train_acc += (output.max(1)[1] == y).sum().item()
train_n += y.size(0)
scheduler.step()
if args.early_stop:
# Check current PGD robustness of model using random minibatch
X, y = first_batch
pgd_delta = attack_pgd(model, X, y, epsilon, pgd_alpha, 5, 1, opt)
with torch.no_grad():
output = model(clamp(X + pgd_delta[:X.size(0)], lower_limit, upper_limit))
robust_acc = (output.max(1)[1] == y).sum().item() / y.size(0)
if robust_acc - prev_robust_acc < -0.2:
break
prev_robust_acc = robust_acc
best_state_dict = copy.deepcopy(model.state_dict())
epoch_time = time.time()
lr = scheduler.get_lr()[0]
logger.info('Epoch \t Seconds \t LR \t \t Train Loss \t Train Acc')
logger.info('%d \t %.1f \t \t %.4f \t %.4f \t %.4f',
epoch, epoch_time - start_epoch_time, lr, train_loss/train_n, train_acc/train_n)
pgd_loss, pgd_acc = evaluate_pgd(test_loader, model, 20, 1)
pgd_loss_8, pgd_acc_8 = evaluate_pgd_8(test_loader, model, 20, 1)
test_loss, test_acc = evaluate_standard(test_loader, model)
logger.info('Test Loss \t Test Acc \t PGD Loss \t PGD Acc \t PGD Loss_8 \t PGD Acc_8')
logger.info('%.4f \t \t %.4f \t %.4f \t %.4f\t %.4f \t %.4f', test_loss, test_acc, pgd_loss, pgd_acc, pgd_loss_8, pgd_acc_8)
train_time = time.time()
if not args.early_stop:
best_state_dict = model.state_dict()
torch.save(best_state_dict, os.path.join(args.out_dir, 'model.pth'))
logger.info('Total train time: %.4f minutes', (train_time - start_train_time)/60)
# Evaluation
model_test = timm.create_model(args.net, pretrained=False, num_classes=100).cuda()
model_test.load_state_dict(best_state_dict)
model_test.float()
model_test.eval()
pgd_loss, pgd_acc = evaluate_pgd(test_loader, model_test, 20, 1)
pgd_loss_8, pgd_acc_8 = evaluate_pgd_8(test_loader, model_test, 20, 1)
test_loss, test_acc = evaluate_standard(test_loader, model_test)
logger.info('Test Loss \t Test Acc \t PGD Loss \t PGD Acc \t PGD_8 Loss \t PGD_8 Acc')
logger.info('%.4f \t \t %.4f \t %.4f \t %.4f \t %.4f \t %.4f', test_loss, test_acc, pgd_loss, pgd_acc, pgd_loss_8, pgd_acc_8)
if __name__ == "__main__":
    main()
Environment
If I add gradient clipping with clipping_norm=1.0, i.e. optimizer = GeminiAdamOptimizer(model, lr=args.lr_max, initial_scale=1, clipping_norm=1.0), the following error is raised:

File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/colossalai/zero/gemini/gemini_optimizer.py", line 247, in backward
    self.module.backward(loss)
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/colossalai/zero/gemini/gemini_ddp.py", line 164, in backward
    loss.backward()
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/torch/_tensor.py", line 388, in backward
    return handle_torch_function(
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/torch/overrides.py", line 1498, in handle_torch_function
    result = torch_func_method(public_api, types, args, kwargs)
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/colossalai/tensor/colo_tensor.py", line 190, in __torch_function__
    return backward_tensor.backward(**tensor_kwargs)
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/torch/autograd/__init__.py", line 173, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/colossalai/zero/gemini/gemini_ddp.py", line 192, in grad_handle
    chunk.set_l2_norm()
File "/home/lym/miniconda3/envs/lab3/lib/python3.9/site-packages/colossalai/zero/gemini/chunk/chunk.py", line 242, in set_l2_norm
    assert self.l2_norm is None, "you are calculating the l2 norm twice"
AssertionError: you are calculating the l2 norm twice
Hi @LYMDLUT
Although you did not use gradient accumulation explicitly, your code actually relies on it, and Gemini is not compatible with gradient accumulation.
BTW, I am not sure your code is correct. You actually compute the loss twice in a step, but you only record the last one. If you only want to update your input with the gradient, you should set the model's gradients to None.
Please tell us the purpose of your code in detail; then let's see how to fix this bug.
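As a concrete illustration of that suggestion, here is a minimal sketch in plain PyTorch (not an official Colossal-AI recipe; `fgsm_perturbation_step` and the local `clamp` helper are made-up names for this example). It obtains the gradient with respect to the perturbation via `torch.autograd.grad`, so the first pass never populates the model's parameter gradients:

```python
import torch
import torch.nn.functional as F


def clamp(x, lo, hi):
    # element-wise clamp with tensor bounds
    return torch.max(torch.min(x, hi), lo)


def fgsm_perturbation_step(model, X, y, delta, alpha, epsilon, lower_limit, upper_limit):
    """One FGSM step that differentiates only w.r.t. the input perturbation."""
    delta = delta.detach().requires_grad_(True)
    loss = F.cross_entropy(model(X + delta), y)
    # Ask autograd for the input gradient directly; parameter .grad fields are not
    # touched, so no extra backward pass is visible to the wrapped optimizer.
    (grad,) = torch.autograd.grad(loss, delta)
    with torch.no_grad():
        delta = clamp(delta + alpha * torch.sign(grad), -epsilon, epsilon)
        delta = clamp(delta, lower_limit - X, upper_limit - X)
    return delta.detach()
```

Whether this interacts cleanly with Gemini's chunked parameters still needs to be verified; the sketch only illustrates the plain-PyTorch pattern described in the comment above.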
I started from a very basic adversarial-training script and wanted to try this framework; I am not using gradient accumulation.
How should I fix the GA issue then?
Adversarial training performs two backward passes: the first backpropagates to the image, and the second backpropagates to the model. It seems this is incompatible. How should I deal with it? I would very much like to use Colossal-AI in my work, thank you.
@LYMDLUT
Currently, Gemini is not compatible with gradient accumulation. I think you can try ZeRO2 instead.
This is not gradient accumulation; it is just two separate loss backward passes. The first and second passes are unrelated: the first only updates the image, not the model.
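For reference, one way the step could be rearranged so that the Gemini optimizer only ever sees a single backward pass per iteration: compute the perturbation with an optimizer-free gradient (as in the earlier sketch) and keep `optimizer.backward(loss)` / `optimizer.step()` only for the weight update. This is an untested sketch that reuses names from the script above (`train_loader`, `criterion`, `optimizer`, `scheduler`, `alpha`, `epsilon`, `lower_limit`, `upper_limit`) plus the hypothetical `fgsm_perturbation_step` helper:

```python
# Untested sketch of the inner training loop, simplified to zero-initialized delta.
alpha_d, eps_d = alpha.cuda(), epsilon.cuda()
low_d, up_d = lower_limit.cuda(), upper_limit.cuda()

for X, y in train_loader:
    X, y = X.cuda(), y.cuda()

    # 1) adversarial step: gradient w.r.t. the image only, no parameter gradients
    delta = torch.zeros_like(X)
    delta = fgsm_perturbation_step(model, X, y, delta, alpha_d, eps_d, low_d, up_d)

    # 2) model update: the only backward pass the Gemini optimizer ever sees
    output = model(X + delta)
    loss = criterion(output, y)
    optimizer.zero_grad()
    optimizer.backward(loss)   # GeminiAdamOptimizer's loss-scaling backward, as in the script
    optimizer.step()
    scheduler.step()
```

With this arrangement only the weight-update backward is visible to Gemini, which should sidestep the "l2 norm twice" assertion, though that has not been verified here.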
Is there a loss.backward() that does not involve the optimizer? I only need the gradient backpropagated to the image...
Can DeepSpeed solve this problem?
https://github.com/hpcaitech/ColossalAI/blob/f313babd11f8137c2496e7dc54c6b61604cd3672/colossalai/zero/low_level/low_level_optim.py#L56
Does Colossal-AI's ZeRO2 support CPU offload?
Yes, you can refer to https://github.com/hpcaitech/ColossalAI/blob/f313babd11f8137c2496e7dc54c6b61604cd3672/colossalai/zero/low_level/low_level_optim.py#L50
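For completeness, a rough sketch of what switching from Gemini to the low-level ZeRO-2 optimizer might look like. The import path and the flags (`partition_grad`, `cpu_offload`, `clip_grad_norm`) are assumptions read off the file linked above, not a verified recipe; check them against the installed ColossalAI version:

```python
# Hypothetical replacement for the Gemini wrapping in the script above; run inside
# the distributed environment created by colossalai.launch. Flag names are
# assumptions taken from the linked low_level_optim.py.
import torch
from colossalai.zero import LowLevelZeroOptimizer  # assumed export location

model = timm.create_model(args.net, pretrained=False, num_classes=100).cuda()
base_optim = torch.optim.Adam(model.parameters(), lr=args.lr_max,
                              weight_decay=args.weight_decay)
optimizer = LowLevelZeroOptimizer(base_optim,
                                  partition_grad=True,   # assumed: True enables ZeRO stage 2
                                  cpu_offload=True,      # assumed: offload to CPU (see linked L56)
                                  clip_grad_norm=1.0,    # assumed: gradient-clipping knob
                                  initial_scale=2**5)

# The training step would then call optimizer.backward(loss) / optimizer.step() as before.
```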