QAT issue with YOLOX
Hello, I am trying to get YOLOX QAT working. Following the tutorial, I first ran PTQ on the YOLOX model, including the CLE and BC steps. After that, I used aimet_torch.quantsim.QuantizationSimModel.model to obtain a quantized model with the quant ops inserted, and replaced the original yolox.model in the YOLOX trainer with it. However, the forward pass now returns a plain tensor instead of the structured outputs the original yolox.model() produces during training (the trainer reads outputs["total_loss"], as the traceback below shows). Could you please tell me how I should modify my code?
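To make the mismatch concrete, here is a minimal sketch of the two call patterns (a sketch only; the shape and the total_loss key are taken from the traceback below):

# Original FP32 YOLOX model in training mode: returns a dict of losses
outputs = self.model(inps, targets)
loss = outputs["total_loss"]          # works: outputs is a dict

# After swapping in quantsim.model: a raw prediction tensor comes back
outputs = self.model(inps, targets)   # tensor, e.g. shape (N, num_preds, 85)
loss = outputs["total_loss"]          # IndexError: too many indices for tensor of dimension 3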
Error:
2024-01-29 05:45:16 | INFO | yolox.core.trainer:344 - Training of experiment is done and the best AP is 0.00
2024-01-29 05:45:16 | ERROR | yolox.core.launch:98 - An error has been caught in function 'launch', process 'MainProcess' (2285414), thread 'MainThread' (139938635503424):
Traceback (most recent call last):
File "/workspace/workdir_trt/YOLOX/tools/train.py", line 146, in <module>
launch(
└ <function launch at 0x7f4406e45dc0>
> File "/workspace/workdir_trt/YOLOX/yolox/core/launch.py", line 98, in launch
main_func(*args)
│ └ (╒═══════════════════╤═══════════════════════════════════════════════════════════════════════════════════════════════════════...
└ <function main at 0x7f4407be6820>
File "/workspace/workdir_trt/YOLOX/tools/train.py", line 123, in main
trainer.train()
│ └ <function Trainer.train at 0x7f4407be6940>
└ <yolox.core.trainer.Trainer object at 0x7f4407bec910>
File "/workspace/workdir_trt/YOLOX/yolox/core/trainer.py", line 193, in train
self.train_in_epoch()
│ └ <function Trainer.train_in_epoch at 0x7f4407be69d0>
└ <yolox.core.trainer.Trainer object at 0x7f4407bec910>
File "/workspace/workdir_trt/YOLOX/yolox/core/trainer.py", line 202, in train_in_epoch
self.train_in_iter()
│ └ <function Trainer.train_in_iter at 0x7f4407be6a60>
└ <yolox.core.trainer.Trainer object at 0x7f4407bec910>
File "/workspace/workdir_trt/YOLOX/yolox/core/trainer.py", line 208, in train_in_iter
self.train_one_iter()
│ └ <function Trainer.train_one_iter at 0x7f4407be6af0>
└ <yolox.core.trainer.Trainer object at 0x7f4407bec910>
File "/workspace/workdir_trt/YOLOX/yolox/core/trainer.py", line 229, in train_one_iter
loss = outputs["total_loss"]
└ tensor([[[8.0200e+00, 9.7085e+00, 1.6745e+01, ..., 7.9569e-03,
2.4659e-03, 5.6030e-03],
[1.2010e+01, 1.41...
IndexError: too many indices for tensor of dimension 3
Here is the code I modified.
In yolox/core/trainer.py I added the following helpers (new imports shown for completeness; get_data_loader is a local helper of mine that builds the calibration loader):
# New imports for the helpers below; trainer.py's existing imports are unchanged.
from functools import partial
from typing import Tuple

import torch
from tqdm import tqdm

import aimet_common.defs
from aimet_torch import bias_correction
from aimet_torch.cross_layer_equalization import equalize_model
from aimet_torch.quantsim import QuantizationSimModel, QuantParams


def apply_cross_layer_equalization(model: torch.nn.Module, input_shape: tuple):
    """
    Applies CLE on the model in place, which consists of:
      - batch norm folding
      - cross-layer scaling
      - high-bias folding
    Also converts any ReLU6 into ReLU.
    :param model: the loaded model
    :param input_shape: the shape of the input to the model
    """
    equalize_model(model, input_shape)
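A minimal usage sketch for the helper above, assuming model is the FP32 YOLOX torch.nn.Module (the shape matches the 640x640 dummy input used later):

input_shape = (1, 3, 640, 640)                      # NCHW
apply_cross_layer_equalization(model, input_shape)  # equalizes the model in place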
def apply_bias_correction(model: torch.nn.Module, data_loader):
    """
    Applies bias correction on the model in place.
    :param model: the model to quantize
    :param data_loader: DataLoader used during quantization
    """
    # Rounding mode can be 'nearest' or 'stochastic'
    rounding_mode = 'nearest'
    # Number of samples used during quantization
    num_quant_samples = 16
    # Number of samples used for bias correction
    num_bias_correct_samples = 16
    params = QuantParams(weight_bw=8, act_bw=8, round_mode=rounding_mode,
                         quant_scheme='tf_enhanced')
    # Perform bias correction
    bias_correction.correct_bias(model.to(device="cuda"), params,
                                 num_quant_samples=num_quant_samples,
                                 data_loader=data_loader,
                                 num_bias_correct_samples=num_bias_correct_samples)
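Similarly, a hedged sketch of chaining the two PTQ steps before building the sim (CLE first, then bias correction; train_loader is assumed to be a calibration DataLoader in the format correct_bias expects):

apply_cross_layer_equalization(model, (1, 3, 640, 640))  # CLE first, in place
apply_bias_correction(model, train_loader)               # then empirical BC on CUDA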
def forward_pass(decoder, model, data_loader):
    """Forward pass over the data loader, used for compute_encodings."""
    # pylint: disable=no-member
    tensor_type = torch.cuda.FloatTensor
    model = model.eval()
    for imgs, _, info_imgs, ids in tqdm(data_loader):
        with torch.no_grad():
            imgs = imgs.type(tensor_type)
            outputs = model(imgs)
            if decoder is not None:
                outputs = decoder(outputs, dtype=outputs.type())
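For context on the partial used later: AIMET's compute_encodings invokes the callback as forward_pass_callback(sim.model, forward_pass_callback_args), so the binding resolves like this (a sketch):

forward_func = partial(forward_pass, None)   # binds decoder=None
# Inside compute_encodings, AIMET effectively runs
#     forward_func(quantsim.model, net_dataloader)
# which is forward_pass(decoder=None, model=quantsim.model, data_loader=net_dataloader)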
def calculate_quantsim_accuracy(model: torch.nn.Module, evaluator: aimet_common.defs.EvalFunction, dataloader,
                                use_cuda: bool = False, logdir: str = '') -> Tuple[QuantizationSimModel, float]:
    """
    Builds a QuantizationSimModel for the model and returns it with its accuracy.
    :param model: the loaded model
    :param evaluator: the eval function to use for evaluation (currently unused, see below)
    :param dataloader: DataLoader passed in by the trainer (currently unused; a fresh loader is built below)
    :param use_cuda: whether to run on the CUDA device
    :param logdir: log directory used for storing log files
    :return: a tuple of the quantsim and the accuracy of the model on this quantsim
    """
    input_shape = (1, 3, 640, 640)
    if use_cuda:
        model.to(torch.device('cuda'))
        dummy_input = torch.rand(input_shape).cuda()
    else:
        dummy_input = torch.rand(input_shape)

    # apply_cross_layer_equalization(model, input_shape)
    # apply_bias_correction(model, dataloader)

    # Number of batches to use for computing encodings. Only 5 batches are used
    # here to speed up the process; the images in these 5 batches should be
    # sufficient for computing encodings.
    iterations = 5
    net_dataloader = get_data_loader(
        dataset_path='/workspace/datasets',
        img_size=(640, 640),
        batch_size=64,
        num_workers=4,
    )
    quantsim = QuantizationSimModel(model=model, quant_scheme='tf_enhanced',
                                    dummy_input=dummy_input, rounding_mode='nearest',
                                    default_output_bw=8, default_param_bw=8, in_place=False)
    # compute_encodings calls forward_func(quantsim.model, net_dataloader),
    # i.e. forward_pass(None, quantsim.model, net_dataloader) via the partial.
    forward_func = partial(forward_pass, None)
    quantsim.compute_encodings(forward_func, forward_pass_callback_args=net_dataloader)

    # accuracy = evaluator.evaluate(quantsim.model)
    accuracy = None  # evaluation is currently disabled, so no accuracy is computed
    return quantsim, accuracy
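Not part of the original snippet, but for completeness: once the quantsim model has been fine-tuned (QAT), AIMET sims are typically exported with QuantizationSimModel.export. A hedged sketch, where the path and prefix are placeholders:

# Sketch only: './exported' and 'yolox_qat' are placeholder path/prefix.
quantsim.export(path='./exported', filename_prefix='yolox_qat',
                dummy_input=torch.rand(1, 3, 640, 640))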
And I modified before_train as follows:
def before_train(self):
    logger.info("args: {}".format(self.args))
    logger.info("exp value:\n{}".format(self.exp))

    # model related init
    torch.cuda.set_device(self.local_rank)
    model = self.exp.get_model()
    logger.info(
        "Model Summary: {}".format(get_model_info(model, self.exp.test_size))
    )
    model.to(self.device)

    # solver related init
    self.optimizer = self.exp.get_optimizer(self.args.batch_size)

    # value of epoch will be set in `resume_train`
    model = self.resume_train(model)

    self.evaluator = self.exp.get_evaluator(
        batch_size=self.args.batch_size, is_distributed=self.is_distributed
    )

    # data related init
    self.no_aug = self.start_epoch >= self.max_epoch - self.exp.no_aug_epochs
    self.train_loader = self.exp.get_data_loader(
        batch_size=self.args.batch_size,
        is_distributed=self.is_distributed,
        no_aug=self.no_aug,
        cache_img=self.args.cache,
    )
    logger.info("init prefetcher, this might take one minute or less...")
    self.prefetcher = DataPrefetcher(self.train_loader)
    # max_iter means iters per epoch
    self.max_iter = len(self.train_loader)

    # logger.info("PTQ complete, map50 accuracy:{}".format(map50))
    quantsim, map50 = calculate_quantsim_accuracy(model=model,
                                                  evaluator=self.evaluator,
                                                  dataloader=self.train_loader,
                                                  use_cuda=True,
                                                  logdir='')
    # Replace the FP32 model with the quantization-simulation model.
    # Note: self.optimizer was built above from the original model's parameters,
    # and the sim was created with in_place=False, so the optimizer may still be
    # pointing at the FP32 weights rather than at quantsim.model's parameters.
    model = quantsim.model

    self.lr_scheduler = self.exp.get_lr_scheduler(
        self.exp.basic_lr_per_img * self.args.batch_size, self.max_iter
    )
    if self.args.occupy:
        occupy_mem(self.local_rank)

    if self.is_distributed:
        model = DDP(model, device_ids=[self.local_rank], broadcast_buffers=False)

    if self.use_model_ema:
        self.ema_model = ModelEMA(model, 0.9998)
        self.ema_model.updates = self.max_iter * self.start_epoch

    self.model = model

    # Tensorboard and Wandb loggers
    if self.rank == 0:
        if self.args.logger == "tensorboard":
            self.tblogger = SummaryWriter(os.path.join(self.file_name, "tensorboard"))
        elif self.args.logger == "wandb":
            self.wandb_logger = WandbLogger.initialize_wandb_logger(
                self.args,
                self.exp,
                self.evaluator.dataloader.dataset
            )
        else:
            raise ValueError("logger must be either 'tensorboard' or 'wandb'")

    logger.info("Training start...")
    logger.info("\n{}".format(model))