Factorized Tensor slower than Neural Network Layer !!!
```python
import tltorch
import torch
from torch.profiler import profile, record_function, ProfilerActivity

data = torch.randn((4, 16), dtype=torch.float32)
linear = torch.nn.Linear(16, 10)

# Tensorize the 16x10 weight as (4, 4) x (2, 5) with a Tucker factorization
fact_linear = tltorch.FactorizedLinear.from_linear(
    linear, auto_tensorize=False,
    in_tensorized_features=(4, 4), out_tensorized_features=(2, 5),
    rank=0.1, factorization="tucker")

data = data.to("cuda")
linear = linear.to("cuda")
fact_linear = fact_linear.to("cuda")

# Profile the dense layer first
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    with record_function("model_inference"):
        linear(data)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```
```
---------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------
Name                                                          Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
---------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------
model_inference                                                   16.99%       1.054ms        99.71%       6.186ms       6.186ms       0.000us         0.00%       4.000us        4.000us             1
aten::linear                                                       0.29%      18.000us        82.72%       5.132ms       5.132ms       0.000us         0.00%       4.000us        4.000us             1
aten::addmm                                                       60.54%       3.756ms        81.43%       5.052ms       5.052ms       4.000us       100.00%       4.000us        4.000us             1
void gemmSN_TN_kernel<float, 128, 16, 2, 4, 4, 4, tr...           0.00%       0.000us         0.00%       0.000us       0.000us       4.000us       100.00%       4.000us        4.000us             1
aten::t                                                            0.63%      39.000us         1.00%      62.000us      62.000us       0.000us         0.00%       0.000us        0.000us             1
aten::transpose                                                    0.24%      15.000us         0.37%      23.000us      23.000us       0.000us         0.00%       0.000us        0.000us             1
aten::as_strided                                                   0.13%       8.000us         0.13%       8.000us       8.000us       0.000us         0.00%       0.000us        0.000us             1
cudaLaunchKernel                                                  20.89%       1.296ms        20.89%       1.296ms       1.296ms       0.000us         0.00%       0.000us        0.000us             1
cudaDeviceSynchronize                                              0.29%      18.000us         0.29%      18.000us      18.000us       0.000us         0.00%       0.000us        0.000us             1
---------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------
Self CPU time total: 6.204ms
Self CUDA time total: 4.000us
```
```python
# Profile the factorized layer on the same input
with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
             record_shapes=True) as prof:
    with record_function("model_inference"):
        fact_linear(data)

print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
```
```
---------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------
Name                                                          Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
---------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------
model_inference                                                   15.19%       1.098ms        99.79%       7.215ms       7.215ms       0.000us         0.00%      27.000us       27.000us             1
aten::matmul                                                       0.40%      29.000us        62.63%       4.528ms       1.132ms       0.000us         0.00%      12.000us        3.000us             4
aten::mm                                                          48.37%       3.497ms        62.23%       4.499ms       1.125ms      12.000us        44.44%      12.000us        3.000us             4
aten::reshape                                                      0.91%      66.000us         6.10%     441.000us      44.100us       0.000us         0.00%      10.000us        1.000us            10
aten::clone                                                        0.55%      40.000us         3.91%     283.000us      94.333us       0.000us         0.00%      10.000us        3.333us             3
aten::copy_                                                        1.40%     101.000us         2.28%     165.000us      55.000us      10.000us        37.04%      10.000us        3.333us             3
void at::native::elementwise_kernel<128, 2, at::nati...           0.00%       0.000us         0.00%       0.000us       0.000us      10.000us        37.04%      10.000us        3.333us             3
void gemmk1_kernel<int, float, 256, 5, false, false,...           0.00%       0.000us         0.00%       0.000us       0.000us       9.000us        33.33%       9.000us        3.000us             3
aten::linear                                                       0.36%      26.000us         2.23%     161.000us     161.000us       0.000us         0.00%       5.000us        5.000us             1
aten::addmm                                                        1.00%      72.000us         1.27%      92.000us      92.000us       5.000us        18.52%       5.000us        5.000us             1
---------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  -------------  ------------
Self CPU time total: 7.230ms
Self CUDA time total: 27.000us
```
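Reading the two tables side by side explains most of the gap: the dense layer's forward resolves to a single fused `aten::addmm` (one gemm kernel, 4 µs of device time), while the Tucker-factorized forward dispatches four `aten::mm` calls plus three `aten::copy_` kernels from the intermediate reshapes (27 µs total). At batch size 4 with a 16×10 weight, per-kernel launch overhead dominates, so the extra kernels cost more than the FLOPs they save. Note also that profiling a single call like this mixes one-off CPU-side costs into the totals; below is a minimal sketch of a steadier measurement using CUDA events, assuming the `linear`, `fact_linear`, and `data` objects defined above:

```python
import torch

def time_module(module, data, iters=1000, warmup=50):
    """Average per-call latency in milliseconds, measured with CUDA events."""
    with torch.no_grad():
        for _ in range(warmup):       # absorb first-call allocation/caching costs
            module(data)
        torch.cuda.synchronize()
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        start.record()
        for _ in range(iters):
            module(data)
        end.record()
        torch.cuda.synchronize()      # wait for all queued kernels to finish
    return start.elapsed_time(end) / iters

print(f"nn.Linear:        {time_module(linear, data):.4f} ms/call")
print(f"FactorizedLinear: {time_module(fact_linear, data):.4f} ms/call")
```

The warmup iterations and explicit synchronization matter here: without them, first-call allocation and kernel-caching costs land in the measurement.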
I'm also finding that the tltorch tensorized layers are significantly slower than the standard torch fully connected layers. Did you ever find a solution to this issue?
Which implementation do you use, reconstructed or factorized? Reconstructed should take about the same time as the unfactorized version. The factorized version's performance will depend heavily on your specific use case: the size of your matrix, the size of the factorization, etc.
I checked the runtime of both the factorized and reconstructed implementations, and the inference time of each is still slower than the standard linear layer.
```python
import tltorch
import torch
from torch.profiler import profile, record_function, ProfilerActivity

data = torch.randn((4, 1024), dtype=torch.float32)
linear = torch.nn.Linear(1024, 512)

fact_linear_w_factorized = tltorch.FactorizedLinear.from_linear(
    linear, auto_tensorize=False,
    in_tensorized_features=(4, 4, 4, 4, 4), out_tensorized_features=(2, 4, 4, 4, 4),
    rank=0.1, factorization="tucker", implementation="factorized")
fact_linear_w_reconstructed = tltorch.FactorizedLinear.from_linear(
    linear, auto_tensorize=False,
    in_tensorized_features=(4, 4, 4, 4, 4), out_tensorized_features=(2, 4, 4, 4, 4),
    rank=0.1, factorization="tucker", implementation="reconstructed")

data = data.to("cuda")
linear = linear.to("cuda")
fact_linear_w_factorized = fact_linear_w_factorized.to("cuda")
fact_linear_w_reconstructed = fact_linear_w_reconstructed.to("cuda")
```
```python
%%timeit
with torch.no_grad():
    fact_linear_w_reconstructed(data)
```
```
880 µs ± 8.69 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
```

```python
%%timeit
with torch.no_grad():
    fact_linear_w_factorized(data)
```
```
2.74 ms ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
```

```python
%%timeit
with torch.no_grad():
    linear(data)
```
```
38.7 µs ± 946 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
```
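Those numbers are consistent with how the two code paths work: the factorized forward runs a chain of small matmuls whose kernel-launch overhead swamps the saved FLOPs at batch size 4, and the reconstructed forward rebuilds the dense weight matrix on every call before doing the usual addmm. If the factorization is only needed to compress parameters during training, one workaround is to materialize the dense weight once and serve inference from a plain `nn.Linear`. A hedged sketch below; it assumes the tensorized weight exposes a `to_matrix()` method returning the full `(out_features, in_features)` matrix, which is worth verifying against your tltorch version:

```python
import torch

# Sketch: bake the reconstructed weight into a plain nn.Linear for inference.
# Assumption: `weight.to_matrix()` materializes the tensorized weight as a
# dense (512, 1024) matrix -- check this against your tltorch version.
dense = torch.nn.Linear(1024, 512).to("cuda")
with torch.no_grad():
    dense.weight.copy_(fact_linear_w_reconstructed.weight.to_matrix())
    dense.bias.copy_(fact_linear_w_reconstructed.bias)

with torch.no_grad():
    out = dense(data)  # single addmm kernel per call, same cost as the plain layer
```

This keeps the parameter compression during training but gives up the memory savings at inference, so whether it helps depends on why the layer was factorized in the first place.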