SIMD: how to convert one uint8x16 into four uint32x4 with a transpose?
Hello, I have a question: how can I expand one uint8x16 into four uint32x4 with a transpose? That is, going from the layout rgba rgba rgba rgba to rrrr gggg bbbb aaaa. And how do I go backward, from four uint32x4 to a single uint8x16, again with a transpose?
So you want one uint8x16 turned into four uint32x4 like this:
input (uint8x16): rgba,rgba,rgba,rgba (r, g, b, and a are each 8 bits)
output 0 (uint32x4): r,r,r,r (each r is 32 bits); output 1 (uint32x4): g,g,g,g (each g is 32 bits); output 2 (uint32x4): b,b,b,b (each b is 32 bits); output 3 (uint32x4): a,a,a,a (each a is 32 bits)
and vice-versa.
Is my understanding correct?
correct
Maybe something like this will do the trick:
/**
 * Splits one Uint8x16 of interleaved bytes (rgba rgba rgba rgba) into four
 * Uint32x4 vectors, one per channel, and writes them to `dst`.
 *
 * Each shuffle picks every fourth byte of `src` and pads the three bytes
 * above it from a zero vector (shuffle index 16 is lane 0 of the second
 * operand), so every selected byte ends up as the low byte of a 32-bit lane.
 *
 * @param src SIMD.Uint8x16 holding the 16 input bytes.
 * @param dst Typed array receiving the result: channel 0 at elements 0-3,
 *            channel 1 at 4-7, channel 2 at 8-11, channel 3 at 12-15
 *            (presumably a Uint32Array of at least 16 elements, as in the
 *            accompanying test).
 */
function to4xUint32x4(src, dst) {
  var zeros = SIMD.Uint8x16.splat(0);

  // Lane indices stay literal constants; 16 always means "a zero byte".
  var red = SIMD.Uint32x4.fromUint8x16Bits(
    SIMD.Uint8x16.shuffle(src, zeros, 0, 16, 16, 16, 4, 16, 16, 16, 8, 16, 16, 16, 12, 16, 16, 16));
  var green = SIMD.Uint32x4.fromUint8x16Bits(
    SIMD.Uint8x16.shuffle(src, zeros, 1, 16, 16, 16, 5, 16, 16, 16, 9, 16, 16, 16, 13, 16, 16, 16));
  var blue = SIMD.Uint32x4.fromUint8x16Bits(
    SIMD.Uint8x16.shuffle(src, zeros, 2, 16, 16, 16, 6, 16, 16, 16, 10, 16, 16, 16, 14, 16, 16, 16));
  var alpha = SIMD.Uint32x4.fromUint8x16Bits(
    SIMD.Uint8x16.shuffle(src, zeros, 3, 16, 16, 16, 7, 16, 16, 16, 11, 16, 16, 16, 15, 16, 16, 16));

  // One 4-lane vector per channel, stored back to back.
  SIMD.Uint32x4.store(dst, 0, red);
  SIMD.Uint32x4.store(dst, 4, green);
  SIMD.Uint32x4.store(dst, 8, blue);
  SIMD.Uint32x4.store(dst, 12, alpha);
}
// Quick check of to4xUint32x4: feed it bytes numbered 0..15 and log the
// 16 resulting 32-bit values.
var output = new Uint32Array(16);
to4xUint32x4(SIMD.Uint8x16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), output);
console.log(output);
Note: I assume that the 'r's are in input lanes 0, 4, 8, and 12, the 'g's are in input lanes 1, 5, 9, and 13, etc.
How about the following?
// Constant: create outside the hot loop.
var mask = SIMD.Uint32x4.splat(0xFF);

// Reinterpreting the 16 bytes as four 32-bit lanes is a no-op.
var input = SIMD.Uint32x4.fromUint8x16Bits(
  SIMD.Uint8x16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15));

// PSRLD: 1 cycle in port 0 on Intel Haswell and newer.
var shiftedBy8 = SIMD.Uint32x4.shiftRightByScalar(input, 8);
var shiftedBy16 = SIMD.Uint32x4.shiftRightByScalar(input, 16);
var shiftedBy24 = SIMD.Uint32x4.shiftRightByScalar(input, 24);

// PAND: 1 cycle in port 0, 1 or 5 on Intel Haswell and newer.
var r = SIMD.Uint32x4.and(input, mask);
var g = SIMD.Uint32x4.and(shiftedBy8, mask);
var b = SIMD.Uint32x4.and(shiftedBy16, mask);
var a = SIMD.Uint32x4.and(shiftedBy24, mask);

// Prints, in order:
//   SIMD.Uint32x4(0, 4, 8, 12)
//   SIMD.Uint32x4(1, 5, 9, 13)
//   SIMD.Uint32x4(2, 6, 10, 14)
//   SIMD.Uint32x4(3, 7, 11, 15)
[r, g, b, a].forEach(function (channel) {
  console.log(channel.toString());
});
In native SSE code, that would be expected to run in four cycles (of throughput cost) per iteration, since the `and` of the previous color component and the `shiftRightByScalar` of the next color component can be scheduled in parallel on different ports on Haswell and newer architectures.
Why don't you create a Gitter room? Also, how do I convert back to uint8x16, as it was before?
The reverse operation could be done like this:
/**
 * Packs four channel vectors back into one Uint8x16 of interleaved bytes;
 * the inverse of to4xUint32x4.
 *
 * The channel vectors are loaded from `src` at element offsets 0, 4, 8 and 12.
 * Each swizzle moves the low byte of every 32-bit lane to its target byte
 * position and fills all other positions from lane 1 of the same vector.
 *
 * NOTE(review): lane 1 is used as a stand-in for zero, and channel 0 is taken
 * unmodified, so the result is only meaningful when every stored value fits
 * in 8 bits (the upper three bytes of each 32-bit lane are zero).
 *
 * @param src Typed array holding the four channel vectors (presumably the
 *            Uint32Array filled by to4xUint32x4).
 * @returns SIMD.Uint8x16 with the bytes interleaved as rgba rgba rgba rgba.
 */
function toUint8x16(src) {
  var red = SIMD.Uint8x16.fromUint32x4Bits(SIMD.Uint32x4.load(src, 0));
  var green = SIMD.Uint8x16.fromUint32x4Bits(SIMD.Uint32x4.load(src, 4));
  var blue = SIMD.Uint8x16.fromUint32x4Bits(SIMD.Uint32x4.load(src, 8));
  var alpha = SIMD.Uint8x16.fromUint32x4Bits(SIMD.Uint32x4.load(src, 12));

  // Move each channel's bytes from lanes 0/4/8/12 to their final lanes.
  var greenPlaced = SIMD.Uint8x16.swizzle(green, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12, 1, 1);
  var bluePlaced = SIMD.Uint8x16.swizzle(blue, 1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12, 1);
  var alphaPlaced = SIMD.Uint8x16.swizzle(alpha, 1, 1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12);

  // The red bytes already sit in lanes 0/4/8/12, so they need no swizzle.
  var redGreen = SIMD.Uint8x16.or(red, greenPlaced);
  var redGreenBlue = SIMD.Uint8x16.or(redGreen, bluePlaced);
  return SIMD.Uint8x16.or(redGreenBlue, alphaPlaced);
}
// Round-trip check: split bytes 0..15 into four channel vectors, pack them
// back into one vector, and log the result.
var output = new Uint32Array(16);
to4xUint32x4(SIMD.Uint8x16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15), output);
var restored = toUint8x16(output);
console.log(restored.toString()); // prints SIMD.Uint8x16(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
Not sure what you mean by gitter?
gitter.im - it's a chat room for repos. @acterhd sometimes maintainers don't want the increased cost of having to check yet another place for support.
In @PeterJensen's reverse operation, the swizzles assume that the inputs are in uint8 range, and all those swizzles of lane 1 assume that they will be receiving zeroes; otherwise the or operation will generate garbage, right?
I doubt that the above code patterns with the proposed swizzles or shuffles will have good performance, since they use the kind of swizzle and shuffle patterns that do not exist as a fast operation in native SSE or NEON. Assuming that lane 1 has the value 0, the code
// Copies lanes 0, 4, 8 and 12 of src1 into result lanes 1, 5, 9 and 13;
// every other result lane is a copy of lane 1 (assumed here to be zero).
SIMD.Uint8x16.swizzle(src1, 1, 0, 1, 1, 1, 4, 1, 1, 1, 8, 1, 1, 1, 12, 1, 1)
is better written as
// Shift every 32-bit element of src1 left by 8 bits, then view the result as bytes again.
SIMD.Uint8x16.fromUint32x4Bits(SIMD.Uint32x4.shiftLeftByScalar(SIMD.Uint32x4.fromUint8x16Bits(src1), 8))
which could map to the PSLLD instruction that is 1 throughput clock cycle of work.
Thanks @juj much better!
The code can be simplified a bit more (fewer conversions), if the input values (srcx) are kept as Uint32x4 values. The complete function now looks like this:
/**
 * Packs four Uint32x4 channel vectors back into one Uint8x16 of interleaved
 * bytes, using only shifts and ors.
 *
 * The channel vectors are loaded from `src` at element offsets 0, 4, 8 and 12.
 * Channel k is shifted left by 8 * k bits so that its low byte lands in byte k
 * of each 32-bit lane; the four vectors are then or-ed together and the
 * result is reinterpreted as 16 bytes.
 *
 * NOTE(review): nothing masks the inputs, so the result is only meaningful
 * when every stored value fits in 8 bits. Larger values spill into the
 * neighbouring bytes.
 *
 * @param src Typed array holding the four channel vectors (presumably the
 *            Uint32Array filled by to4xUint32x4).
 * @returns SIMD.Uint8x16 with the bytes interleaved as rgba rgba rgba rgba.
 */
function toUint8x16(src) {
  // The 'r's need no shift: they are already in the low byte of each lane.
  var packed = SIMD.Uint32x4.load(src, 0);
  var green = SIMD.Uint32x4.load(src, 4);
  var blue = SIMD.Uint32x4.load(src, 8);
  var alpha = SIMD.Uint32x4.load(src, 12);

  // Combine while still in the Uint32x4 domain, so that a single
  // reinterpretation at the end is enough.
  packed = SIMD.Uint32x4.or(packed, SIMD.Uint32x4.shiftLeftByScalar(green, 8));
  packed = SIMD.Uint32x4.or(packed, SIMD.Uint32x4.shiftLeftByScalar(blue, 16));
  packed = SIMD.Uint32x4.or(packed, SIMD.Uint32x4.shiftLeftByScalar(alpha, 24));

  return SIMD.Uint8x16.fromUint32x4Bits(packed);
}
So that is a total of 3 shift and 3 or operations.