Arraymancer SlicerImpl template is slow

SlicerImpl template is slow

Open mratsim opened this issue 5 years ago • 0 comments

In ex01_bench slicing is quite inefficient doing a relu activation is actually cheaper

2018-12-15_17-15-14

Focusing on the call tree, the slowness seems to be in implementation itself:

2018-12-15_17-16-11

Assembly

2018-12-15_17-18-31

2018-12-15_17-18-53

2018-12-15_17-19-08

Analysis

As expected 20% of the time is spent in integer division.
The movq instructions at the beginning corresponds the copy of metadata arrays
The movaps at the beginning are zero-ing (a lot) of stack memory
FUrther inefficiencies to analyze

C code

N_LIB_PRIVATE N_NIMCALL(void, slicer_6TJobm9a5fqL8CcSU7Kmotg)(tyObject_Tensor_JemaD8Ay5p2W2ZM1GrtEmQ* t, tyObject_SteppedSlice_r4gi6b3i9cwWvbDTx29bhqpg* slices, NI slicesLen_0, tyObject_Tensor_JemaD8Ay5p2W2ZM1GrtEmQ* Result) {
	(*Result).shape = (*t).shape;
	(*Result).strides = (*t).strides;
	(*Result).offset = (*t).offset;
	unsureAsgnRef((void**) (&(*Result).storage.Fdata), (*t).storage.Fdata);
	{
		NI i;
		tyObject_SteppedSlice_r4gi6b3i9cwWvbDTx29bhqpg slice;
		tyObject_DynamicStackArray_wRBrWDS73mcYvSRSW2Fvtw colontmp_;
		i = (NI)0;
		nimZeroMem((void*)(&slice), sizeof(tyObject_SteppedSlice_r4gi6b3i9cwWvbDTx29bhqpg));
		nimZeroMem((void*)(&colontmp_), sizeof(tyObject_DynamicStackArray_wRBrWDS73mcYvSRSW2Fvtw));
		colontmp_ = toArrayOfSlices_49aX4vpzwvrg5vfLxBPRKtgp_accessors_macros_read(slices, slicesLen_0);
		{
			NI i_2;
			NI colontmp__2;
			NI i_3;
			i_2 = (NI)0;
			colontmp__2 = (NI)0;
			colontmp__2 = colontmp_.len;
			i_3 = ((NI) 0);
			{
				while (1) {
					NI a;
					NI b;
					NI* T17_;
					NI* T18_;
					if (!(i_3 < colontmp__2)) goto LA4;
					i_2 = i_3;
					i = i_2;
					slice = colontmp_.data[(i_2)- 0];
					{
						NI* T9_;
						if (!slice.a_from_end) goto LA7_;
						T9_ = (NI*)0;
						T9_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).shape), i);
						a = (NI)((*T9_) - slice.a);
					}
					goto LA5_;
					LA7_: ;
					{
						a = slice.a;
					}
					LA5_: ;
					{
						NI* T15_;
						if (!slice.b_from_end) goto LA13_;
						T15_ = (NI*)0;
						T15_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).shape), i);
						b = (NI)((*T15_) - slice.b);
					}
					goto LA11_;
					LA13_: ;
					{
						b = slice.b;
					}
					LA11_: ;
					T17_ = (NI*)0;
					T17_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).strides), i);
					(*Result).offset += (NI)(a * (*T17_));
					T18_ = (NI*)0;
					T18_ = X5BX5D__hbVK8t1uBzeiL7wEMkEqdQhigher_order_applymap((&(*Result).strides), i);
					stareq__tk9bwZBdb9bO9baUcUN2AX89bQdata_structure(T18_, slice.step);
					X5BX5Deq__HrkcdSE0L9cV8m9ahTqvmyOQdata_structure((&(*Result).shape), i, (NI)(((NI)((NI)(b - a) / slice.step) > 0? ((NI)((NI)(b - a) / slice.step)) : -((NI)((NI)(b - a) / slice.step))) + ((NI) 1)));
					i_3 += ((NI) 1);
				} LA4: ;
			}
		}
	}
}
N_LIB_PRIVATE N_NIMCALL(void, arraymancer_p_accessors_macros_readInit000)(void) {
{
	TFrame FR_; FR_.len = 0;
}
}

N_LIB_PRIVATE N_NIMCALL(void, arraymancer_p_accessors_macros_readDatInit000)(void) {
}

Dec 15 '18 16:12 mratsim

Arraymancer Arraymancer copied to clipboard

SlicerImpl template is slow

Assembly

Analysis

C code

Arraymancer
Arraymancer copied to clipboard