slicing slower than split_cat
Hi, I used the following code to test the FPS of the “slicing” and “split_cat” patterns on an RTX 2060 (10 rounds of testing in total; each round warms up 20 times and then runs 100 timed inferences, so the final result is the average over the 10 × 100 inference times). The results are shown below. Unexpectedly, I found that “slicing” was slower than “split_cat”. Have you ever run into this problem? Thank you~
split_cat:{'fps': 246.7, 'time_mean': 4.1, 'time_std': 0.2}
slicing:{'fps': 227.0, 'time_mean': 4.4, 'time_std': 0.3}
import time
from argparse import ArgumentParser

import numpy as np
import torch

from utils.utils import *
from utils.fuse_conv_bn import fuse_conv_bn
from data.data_api import LitDataModule
from models.model_api import LitModel


def fps_mm(model, repetitions, num_warmup, infer_epoch):
    # load the model onto the GPU
    device = torch.device("cuda:0")
    model.to(device)
    model.eval()
    torch.backends.cudnn.benchmark = True

    # initialize a dummy input image
    data = torch.randn(1, 3, 224, 224, dtype=torch.float).to(device)

    result_average = {'fps': 0, 'time_mean': 0, 'time_std': 0}
    for _ in range(infer_epoch):
        result = {}
        infer_time = []
        for i in range(repetitions):
            torch.cuda.synchronize()
            start_time = time.perf_counter()
            # infer
            with torch.no_grad():
                model(data)
            torch.cuda.synchronize()
            elapsed = (time.perf_counter() - start_time)
            if i >= num_warmup:
                infer_time.append(elapsed)
        result['fps'] = (repetitions - num_warmup) / sum(infer_time)
        result['time_mean'] = np.mean(infer_time) * 1000
        result['time_std'] = np.std(infer_time) * 1000
        result_average['fps'] += result['fps']
        result_average['time_mean'] += result['time_mean']
        result_average['time_std'] += result['time_std']
        for key, value in result.items():
            result[key] = round(value, 1)
        print(result)
    for key, value in result_average.items():
        result_average[key] = round(value / infer_epoch, 1)
    print("result_average:")
    print(result_average)


def main(args):
    # Init data pipeline
    dm, _ = LitDataModule(hparams=args)

    # Init LitModel
    if args.checkpoint_path is not None:
        PATH = args.checkpoint_path
        if PATH[-5:] == '.ckpt':
            model = LitModel.load_from_checkpoint(PATH, map_location='cpu', num_classes=dm.num_classes, hparams=args)
            print('Successfully load the pl checkpoint file.')
            if args.pl_ckpt_2_torch_pth:
                device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
                model = model.model.to(device)
                torch.save(model.state_dict(), PATH[:-5] + '.pth')
                exit()
        elif PATH[-4:] == '.pth':
            model = LitModel(num_classes=dm.num_classes, hparams=args)
            missing_keys, unexpected_keys = model.model.load_state_dict(torch.load(PATH), False)
            # show for debug
            print('missing_keys: ', missing_keys)
            print('unexpected_keys: ', unexpected_keys)
        else:
            raise TypeError
    else:
        model = LitModel(num_classes=dm.num_classes, hparams=args)

    if args.fuse_conv_bn:
        fuse_conv_bn(model.model)

    if args.measure_latency:
        model = model.model
        fps_mm(model, repetitions=120, num_warmup=20, infer_epoch=10)


if __name__ == "__main__":
    parser = ArgumentParser()
    parser.add_argument('-c', '--cfg', type=str, default='cfg/fasternet_t0.yaml')
    parser.add_argument('-g', "--gpus", type=str, default="0",
                        help="Number of GPUs to train on (int) or which GPUs to train on (list or str) applied per node.")
    parser.add_argument('-d', "--dev", type=int, default=0, help='fast_dev_run for debug')
    parser.add_argument("--num_nodes", type=int, default=1)
    parser.add_argument('-n', "--num_workers", type=int, default=4)
    parser.add_argument('-b', "--batch_size", type=int, default=2048)
    parser.add_argument('-e', "--batch_size_eva", type=int, default=1, help='batch_size for evaluation')
    parser.add_argument("--model_ckpt_dir", type=str, default="./model_ckpt/")
    parser.add_argument("--data_dir", type=str, default="../../data/imagenet")
    parser.add_argument('--pin_memory', action='store_true')
    parser.add_argument("--checkpoint_path", type=str, default=None)
    parser.add_argument("--pconv_fw_type", type=str, default='slicing',
                        help="use 'split_cat' for training/inference and 'slicing' only for inference")
    parser.add_argument('--measure_latency', action='store_true', help='measure latency or throughput')
    parser.add_argument('--test_phase', action='store_true')
    parser.add_argument('--fuse_conv_bn', action='store_true')
    parser.add_argument("--wandb_project_name", type=str, default="fasternet")
    parser.add_argument('--wandb_offline', action='store_true')
    parser.add_argument('--wandb_save_dir', type=str, default='./')
    parser.add_argument('--pl_ckpt_2_torch_pth', action='store_true',
                        help='convert pl .ckpt file to torch .pth file, and then exit')

    args = parser.parse_args()
    cfg = load_cfg(args.cfg)
    args = merge_args_cfg(args, cfg)

    # please change {WANDB_API_KEY} to your personal api_key before using wandb
    # os.environ["WANDB_API_KEY"] = "{WANDB_API_KEY}"

    main(args)
@wsy-yjys Hi, the slicing mode can be slower because of the feature-map clone (see the code x = x.clone()). Such a clone is necessary to avoid modifying the input in place, because the same input is reused later for the shortcut addition. However, if the shortcut is not taken exactly before the PConv, the slicing mode can be faster, since the input clone can then be removed.
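For reference, here is a minimal sketch of what the two forward paths look like. The class and attribute names are illustrative (loosely modeled on the Partial_conv3 module and the pconv_fw_type option mentioned in this thread), not copied verbatim from the repository:

import torch
import torch.nn as nn

class PartialConv3Sketch(nn.Module):
    """Sketch of a partial 3x3 conv with the two forward modes discussed above."""

    def __init__(self, dim, n_div):
        super().__init__()
        self.dim_conv3 = dim // n_div              # channels that get convolved
        self.dim_untouched = dim - self.dim_conv3  # channels passed through unchanged
        self.partial_conv3 = nn.Conv2d(self.dim_conv3, self.dim_conv3, 3, 1, 1, bias=False)

    def forward_slicing(self, x):
        # Inference-only path: write the conv result back into a slice of x.
        # The clone keeps the caller's tensor intact, because the same tensor
        # is reused later as the residual shortcut.
        x = x.clone()
        x[:, :self.dim_conv3, :, :] = self.partial_conv3(x[:, :self.dim_conv3, :, :])
        return x

    def forward_split_cat(self, x):
        # Training/inference path: split, convolve the first chunk, concatenate.
        # No in-place write, so no clone is needed.
        x1, x2 = torch.split(x, [self.dim_conv3, self.dim_untouched], dim=1)
        x1 = self.partial_conv3(x1)
        return torch.cat((x1, x2), dim=1)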
Oh…… I got it, thank you~
Hi, I have rethought your code implementation: why not change shortcut = x to shortcut = x.clone()? Then we could comment out x = x.clone() in Partial_conv3?


Good question, waiting for a reply.
@wsy-yjys Hi, shortcut = x is a shallow copy (it just binds another reference to the same tensor), which is much faster than the deep copy x = x.clone(). Therefore, the implementation you suggested would (see the sketch after this list):
- Speed up the inference a bit or negligibly for the slicing mode.
- Slow down the inference (maybe considerably) for the concatenation mode.
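To make the trade-off concrete, here is a minimal sketch (illustrative names only; it assumes a block of the form shortcut -> PConv -> MLP -> residual add, with spatial_mixing standing in for the PConv):

# Sketch only: 'block' is assumed to expose spatial_mixing (the PConv) and mlp.

def block_forward_current(block, x):
    shortcut = x                    # shallow copy: just a new reference, essentially free
    x = block.spatial_mixing(x)     # slicing mode clones x internally before its in-place write
    return shortcut + block.mlp(x)  # the untouched input is still available here

def block_forward_suggested(block, x):
    shortcut = x.clone()            # deep copy: the whole feature map is copied in BOTH modes
    x = block.spatial_mixing(x)     # slicing mode could then drop its internal clone
    return shortcut + block.mlp(x)  # split_cat mode now pays for a copy it never needed

Moving the clone out of the PConv shifts the copy cost from the slicing path alone onto every path, which is why the concatenation mode would get slower.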
@JierunChen Hi, thank you for your reply, I see now. I appreciate how thorough and responsive you are.