Arraymancer
Arraymancer copied to clipboard
SlicerImpl template is slow
In ex01_bench
slicing is quite inefficient doing a relu activation is actually cheaper
Focusing on the call tree, the slowness seems to be in implementation itself:
Assembly
Analysis
-
As expected 20% of the time is spent in integer division.
-
The movq instructions at the beginning corresponds the copy of metadata arrays
-
The
movaps
at the beginning are zero-ing (a lot) of stack memory -
FUrther inefficiencies to analyze
C code
N_LIB_PRIVATE N_NIMCALL(void, slicer_6TJobm9a5fqL8CcSU7Kmotg)(tyObject_Tensor_JemaD8Ay5p2W2ZM1GrtEmQ* t, tyObject_SteppedSlice_r4gi6b3i9cwWvbDTx29bhqpg* slices, NI slicesLen_0, tyObject_Tensor_JemaD8Ay5p2W2ZM1GrtEmQ* Result) {
(*Result).shape = (*t).shape;
(*Result).strides = (*t).strides;
(*Result).offset = (*t).offset;
unsureAsgnRef((void**) (&(*Result).storage.Fdata), (*t).storage.Fdata);
{
NI i;
tyObject_SteppedSlice_r4gi6b3i9cwWvbDTx29bhqpg slice;
tyObject_DynamicStackArray_wRBrWDS73mcYvSRSW2Fvtw colontmp_;
i = (NI)0;
nimZeroMem((void*)(&slice), sizeof(tyObject_SteppedSlice_r4gi6b3i9cwWvbDTx29bhqpg));
nimZeroMem((void*)(&colontmp_), sizeof(tyObject_DynamicStackArray_wRBrWDS73mcYvSRSW2Fvtw));
colontmp_ = toArrayOfSlices_49aX4vpzwvrg5vfLxBPRKtgp_accessors_macros_read(slices, slicesLen_0);
{
NI i_2;
NI colontmp__2;
NI i_3;
i_2 = (NI)0;
colontmp__2 = (NI)0;
colontmp__2 = colontmp_.len;
i_3 = ((NI) 0);
{
while (1) {
NI a;
NI b;
NI* T17_;
NI* T18_;
if (!(i_3 < colontmp__2)) goto LA4;
i_2 = i_3;
i = i_2;
slice = colontmp_.data[(i_2)- 0];
{
NI* T9_;
if (!slice.a_from_end) goto LA7_;
T9_ = (NI*)0;
T9_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).shape), i);
a = (NI)((*T9_) - slice.a);
}
goto LA5_;
LA7_: ;
{
a = slice.a;
}
LA5_: ;
{
NI* T15_;
if (!slice.b_from_end) goto LA13_;
T15_ = (NI*)0;
T15_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).shape), i);
b = (NI)((*T15_) - slice.b);
}
goto LA11_;
LA13_: ;
{
b = slice.b;
}
LA11_: ;
T17_ = (NI*)0;
T17_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).strides), i);
(*Result).offset += (NI)(a * (*T17_));
T18_ = (NI*)0;
T18_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).strides), i);
stareq__tk9bwZBdb9bO9baUcUN2AX89bQdata_structure(T18_, slice.step);
X5BX5Deq__HrkcdSE0L9cV8m9ahTqvmyOQdata_structure((&(*Result).shape), i, (NI)(((NI)((NI)(b - a) / slice.step) > 0? ((NI)((NI)(b - a) / slice.step)) : -((NI)((NI)(b - a) / slice.step))) + ((NI) 1)));
i_3 += ((NI) 1);
} LA4: ;
}
}
}
}
N_LIB_PRIVATE N_NIMCALL(void, arraymancer_p_accessors_macros_readInit000)(void) {
{
TFrame FR_; FR_.len = 0;
}
}
N_LIB_PRIVATE N_NIMCALL(void, arraymancer_p_accessors_macros_readDatInit000)(void) {
}