
slicing slower than split_cat

Open wsy-yjys opened this issue 2 years ago • 6 comments

Hi, I used the following code to test the FPS of the "slicing" and "split_cat" modes on an RTX 2060 (10 rounds of testing in total; each round first warms up 20 times, then runs 100 timed inferences; the final result is the average over the 10×100 inference times). The results are shown below. Unexpectedly, "slicing" turned out to be slower than "split_cat". Have you ever run into this problem? Thank you~

split_cat:{'fps': 246.7, 'time_mean': 4.1, 'time_std': 0.2}
slicing:{'fps': 227.0, 'time_mean': 4.4, 'time_std': 0.3}
import time

import numpy as np
import torch

from argparse import ArgumentParser
from utils.utils import *
from utils.fuse_conv_bn import fuse_conv_bn
from data.data_api import LitDataModule
from models.model_api import LitModel

def fps_mm(model, repetitions, num_warmup, infer_epoch):
    # load the model onto the GPU
    device = torch.device("cuda:0")
    model.to(device)
    model.eval()
    torch.backends.cudnn.benchmark = True
    # create a dummy input image
    data = torch.randn(1, 3, 224, 224, dtype=torch.float).to(device)

    result_average = {'fps': 0, 'time_mean': 0, 'time_std': 0}

    for _ in range(infer_epoch):
        result = {}
        infer_time = []

        for i in range(repetitions):
            torch.cuda.synchronize()
            start_time = time.perf_counter()

            # infer
            with torch.no_grad():
                model(data)

            torch.cuda.synchronize()
            elapsed = time.perf_counter() - start_time

            # skip the warm-up iterations when collecting timings
            if i >= num_warmup:
                infer_time.append(elapsed)

        result['fps'] = (repetitions - num_warmup) / sum(infer_time)
        result['time_mean'] = np.mean(infer_time) * 1000  # ms
        result['time_std'] = np.std(infer_time) * 1000  # ms

        result_average['fps'] += result['fps']
        result_average['time_mean'] += result['time_mean']
        result_average['time_std'] += result['time_std']

        for key, value in result.items():
            result[key] = round(value, 1)

        print(result)

    for key, value in result_average.items():
        result_average[key] = round(value / infer_epoch, 1)
    print("result_average:")
    print(result_average)


def main(args):
    # Init data pipeline
    dm, _ = LitDataModule(hparams=args)

    # Init LitModel
    if args.checkpoint_path is not None:
        PATH = args.checkpoint_path
        if PATH.endswith('.ckpt'):
            model = LitModel.load_from_checkpoint(PATH, map_location='cpu', num_classes=dm.num_classes, hparams=args)
            print('Successfully loaded the pl checkpoint file.')
            if args.pl_ckpt_2_torch_pth:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model = model.model.to(device)
                torch.save(model.state_dict(), PATH[:-5] + '.pth')
                exit()
        elif PATH.endswith('.pth'):
            model = LitModel(num_classes=dm.num_classes, hparams=args)
            missing_keys, unexpected_keys = model.model.load_state_dict(torch.load(PATH), strict=False)
            # show for debugging
            print('missing_keys: ', missing_keys)
            print('unexpected_keys: ', unexpected_keys)
        else:
            raise TypeError(f'Unsupported checkpoint format: {PATH}')
    else:
        model = LitModel(num_classes=dm.num_classes, hparams=args)

    if args.fuse_conv_bn:
        fuse_conv_bn(model.model)

    if args.measure_latency:
        model = model.model
        fps_mm(model, repetitions=120, num_warmup=20, infer_epoch=10)

if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('-c', '--cfg', type=str, default='cfg/fasternet_t0.yaml')
    parser.add_argument('-g', "--gpus", type=str, default="0",
                        help="Number of GPUs to train on (int) or which GPUs to train on (list or str) applied per node.")
    parser.add_argument('-d', "--dev", type=int, default=0, help='fast_dev_run for debug')
    parser.add_argument("--num_nodes", type=int, default=1)
    parser.add_argument('-n', "--num_workers", type=int, default=4)
    parser.add_argument('-b', "--batch_size", type=int, default=2048)
    parser.add_argument('-e', "--batch_size_eva", type=int, default=1, help='batch_size for evaluation')
    parser.add_argument("--model_ckpt_dir", type=str, default="./model_ckpt/")
    parser.add_argument("--data_dir", type=str, default="../../data/imagenet")
    parser.add_argument('--pin_memory', action='store_true')
    parser.add_argument("--checkpoint_path", type=str, default=None)
    parser.add_argument("--pconv_fw_type", type=str, default='slicing',
                        help="use 'split_cat' for training/inference and 'slicing' only for inference")
    parser.add_argument('--measure_latency', action='store_true', help='measure latency or throughput')
    parser.add_argument('--test_phase', action='store_true')
    parser.add_argument('--fuse_conv_bn', action='store_true')
    parser.add_argument("--wandb_project_name", type=str, default="fasternet")
    parser.add_argument('--wandb_offline', action='store_true')
    parser.add_argument('--wandb_save_dir', type=str, default='./')
    parser.add_argument('--pl_ckpt_2_torch_pth', action='store_true',
                        help='convert pl .ckpt file to torch .pth file, and then exit')

    args = parser.parse_args()
    cfg = load_cfg(args.cfg)
    args = merge_args_cfg(args, cfg)

    # please change {WANDB_API_KEY} to your personal api_key before using wandb
    # os.environ["WANDB_API_KEY"] = "{WANDB_API_KEY}"

    main(args)

wsy-yjys avatar May 05 '23 03:05 wsy-yjys

@wsy-yjys Hi, the slicing mode can be slower because of the feature-map clone (see the line x = x.clone() in the code). Such a clone is necessary to avoid modifying the input, which is still needed for the shortcut addition. However, if the shortcut is not placed exactly before the PConv, the slicing mode can be made faster by removing the input clone.
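
For reference, the two forward paths look roughly like this (a condensed sketch of Partial_conv3; minor details of the repo code are omitted):

import torch
import torch.nn as nn

class Partial_conv3(nn.Module):
    def __init__(self, dim, n_div, forward):
        super().__init__()
        self.dim_conv3 = dim // n_div
        self.dim_untouched = dim - self.dim_conv3
        self.partial_conv3 = nn.Conv2d(self.dim_conv3, self.dim_conv3, 3, 1, 1, bias=False)
        self.forward = self.forward_slicing if forward == 'slicing' else self.forward_split_cat

    def forward_slicing(self, x):
        # clone so the in-place slice assignment below does not modify the input,
        # which the surrounding block still needs for the shortcut addition
        x = x.clone()
        x[:, :self.dim_conv3, :, :] = self.partial_conv3(x[:, :self.dim_conv3, :, :])
        return x

    def forward_split_cat(self, x):
        # split and cat allocate new tensors, so the input is left untouched
        x1, x2 = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1)
        x1 = self.partial_conv3(x1)
        return torch.cat((x1, x2), 1)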

JierunChen avatar May 05 '23 10:05 JierunChen

Oh... I got it, thank you~

wsy-yjys avatar May 08 '23 08:05 wsy-yjys

Hi, I have been rethinking your code implementation: why not change shortcut = x to shortcut = x.clone()? Then we could comment out x = x.clone() in Partial_conv3 (see the sketch after the screenshots below).

[screenshots: the shortcut = x assignment in the block's forward, and the x = x.clone() line in Partial_conv3]
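
Something like this, I mean (a hypothetical sketch of the proposed change; attribute names are paraphrased, drop_path is omitted, and it reuses the Partial_conv3 sketch above):

import torch
import torch.nn as nn

class BlockSketch(nn.Module):
    def __init__(self, dim, n_div):
        super().__init__()
        self.spatial_mixing = Partial_conv3(dim, n_div, 'slicing')
        self.mlp = nn.Sequential(nn.Conv2d(dim, dim, 1), nn.ReLU(), nn.Conv2d(dim, dim, 1))

    def forward(self, x):
        shortcut = x.clone()        # proposed: deep-copy once here ...
        x = self.spatial_mixing(x)  # ... so forward_slicing could drop its own x.clone()
        return shortcut + self.mlp(x)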

wsy-yjys avatar May 08 '23 12:05 wsy-yjys

Good question, waiting for a reply.

yangshaobo0634 avatar May 12 '23 02:05 yangshaobo0634

@wsy-yjys Hi, shortcut = x is a shallow copy (just a new reference to the same tensor), which is much cheaper than the deep copy x = x.clone(). Therefore, the implementation you suggested would:

  • Speed up inference slightly, or negligibly, for the slicing mode.
  • Slow down inference (possibly considerably) for the concatenation mode, which currently needs no clone at all (a rough timing sketch follows).
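
The asymmetry comes from where the copy lands: forward_split_cat never modifies its input in place, so it needs no clone anywhere, whereas for the slicing mode the clone would merely move from Partial_conv3 into the block. To gauge the cost of one extra clone, here is a minimal micro-benchmark sketch (assumes a CUDA device; timings vary with GPU and tensor shape):

import time
import torch

x = torch.randn(1, 64, 56, 56, device='cuda')  # a typical early-stage feature map
torch.cuda.synchronize()
t0 = time.perf_counter()
for _ in range(1000):
    y = x.clone()  # the extra deep copy that shortcut = x.clone() would add
torch.cuda.synchronize()
print(f'{(time.perf_counter() - t0) / 1000 * 1e6:.1f} us per clone')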

JierunChen avatar May 19 '23 10:05 JierunChen

@JierunChen Hi, thank you for your reply, I see now. You are very conscientious.

wsy-yjys avatar May 21 '23 13:05 wsy-yjys