cccl
cccl copied to clipboard
Add support for large `num_items` to `DevicePartition::ThreeWayPartition`
Simply instantiating the kernel template with a different offset type unfortunately degrades performance: with `i64` offsets, run time increases by as much as 1.66x relative to `i32` offsets (see the benchmark results below). Alternative approaches, such as streaming or a bit-packed tile state, need further investigation.
H100: partition.three_way
T{ct} | Elements{io} | Entropy | i32 | i64 | i64/i32 time | |
---|---|---|---|---|---|---|
I8 | 2^16 = 65536 | 1 | 12.118 | 15.23 | 125.68% | |
I8 | 2^20 = 1048576 | 1 | 14.703 | 19.589 | 133.23% | |
I8 | 2^24 = 16777216 | 1 | 70.194 | 112.999 | 160.98% | |
I8 | 2^28 = 268435456 | 1 | 1002 | 1664 | 166.07% | |
I8 | 2^16 = 65536 | 0.544 | 11.982 | 14.886 | 124.24% | |
I8 | 2^20 = 1048576 | 0.544 | 14.219 | 19.278 | 135.58% | |
I8 | 2^24 = 16777216 | 0.544 | 68.941 | 110.528 | 160.32% | |
I8 | 2^28 = 268435456 | 0.544 | 986.474 | 1631 | 165.34% | |
I8 | 2^16 = 65536 | 0 | 11.358 | 14.45 | 127.22% | |
I8 | 2^20 = 1048576 | 0 | 14.076 | 18.891 | 134.21% | |
I8 | 2^24 = 16777216 | 0 | 68.288 | 108.767 | 159.28% | |
I8 | 2^28 = 268435456 | 0 | 978.981 | 1606 | 164.05% | |
I16 | 2^16 = 65536 | 1 | 12.051 | 18.597 | 154.32% | |
I16 | 2^20 = 1048576 | 1 | 15.497 | 21.508 | 138.79% | |
I16 | 2^24 = 16777216 | 1 | 77.899 | 123.185 | 158.13% | |
I16 | 2^28 = 268435456 | 1 | 1079 | 1645 | 152.46% | |
I16 | 2^16 = 65536 | 0.544 | 11.86 | 18.326 | 154.52% | |
I16 | 2^20 = 1048576 | 0.544 | 14.928 | 21.2 | 142.02% | |
I16 | 2^24 = 16777216 | 0.544 | 77.252 | 120.884 | 156.48% | |
I16 | 2^28 = 268435456 | 0.544 | 1067 | 1627 | 152.48% | |
I16 | 2^16 = 65536 | 0 | 11.435 | 17.837 | 155.99% | |
I16 | 2^20 = 1048576 | 0 | 14.583 | 20.722 | 142.10% | |
I16 | 2^24 = 16777216 | 0 | 76.026 | 119.004 | 156.53% | |
I16 | 2^28 = 268435456 | 0 | 1055 | 1585 | 150.24% | |
I32 | 2^16 = 65536 | 1 | 11.965 | 14.348 | 119.92% | |
I32 | 2^20 = 1048576 | 1 | 18.452 | 20.986 | 113.73% | |
I32 | 2^24 = 16777216 | 1 | 121.937 | 135.45 | 111.08% | |
I32 | 2^28 = 268435456 | 1 | 1747 | 1967 | 112.59% | |
I32 | 2^16 = 65536 | 0.544 | 11.847 | 14.276 | 120.50% | |
I32 | 2^20 = 1048576 | 0.544 | 17.918 | 20.967 | 117.02% | |
I32 | 2^24 = 16777216 | 0.544 | 120.304 | 133.606 | 111.06% | |
I32 | 2^28 = 268435456 | 0.544 | 1719 | 1938 | 112.74% | |
I32 | 2^16 = 65536 | 0 | 11.299 | 14.224 | 125.89% | |
I32 | 2^20 = 1048576 | 0 | 17.511 | 20.992 | 119.88% | |
I32 | 2^24 = 16777216 | 0 | 119.674 | 131.536 | 109.91% | |
I32 | 2^28 = 268435456 | 0 | 1710 | 1911 | 111.75% | |
I64 | 2^16 = 65536 | 1 | 10.847 | 14.181 | 130.74% | |
I64 | 2^20 = 1048576 | 1 | 20.423 | 23.689 | 115.99% | |
I64 | 2^24 = 16777216 | 1 | 170.387 | 198.108 | 116.27% | |
I64 | 2^28 = 268435456 | 1 | 2615 | 2985 | 114.15% | |
I64 | 2^16 = 65536 | 0.544 | 10.959 | 13.895 | 126.79% | |
I64 | 2^20 = 1048576 | 0.544 | 20.469 | 23.841 | 116.47% | |
I64 | 2^24 = 16777216 | 0.544 | 170.317 | 197.888 | 116.19% | |
I64 | 2^28 = 268435456 | 0.544 | 2607 | 2988 | 114.61% | |
I64 | 2^16 = 65536 | 0 | 10.448 | 13.869 | 132.74% | |
I64 | 2^20 = 1048576 | 0 | 19.825 | 23.241 | 117.23% | |
I64 | 2^24 = 16777216 | 0 | 167.539 | 194.209 | 115.92% | |
I64 | 2^28 = 268435456 | 0 | 2580 | 2935 | 113.76% | |
I128 | 2^16 = 65536 | 1 | 11.812 | 14.185 | 120.09% | |
I128 | 2^20 = 1048576 | 1 | 29.403 | 35.674 | 121.33% | |
I128 | 2^24 = 16777216 | 1 | 324.797 | 358.436 | 110.36% | |
I128 | 2^28 = 268435456 | 1 | 5088 | 5572 | 109.51% | |
I128 | 2^16 = 65536 | 0.544 | 11.644 | 14.146 | 121.49% | |
I128 | 2^20 = 1048576 | 0.544 | 29.383 | 35.858 | 122.04% | |
I128 | 2^24 = 16777216 | 0.544 | 323.572 | 358.372 | 110.75% | |
I128 | 2^28 = 268435456 | 0.544 | 5080 | 5570 | 109.65% | |
I128 | 2^16 = 65536 | 0 | 11.689 | 13.858 | 118.56% | |
I128 | 2^20 = 1048576 | 0 | 28.922 | 35.357 | 122.25% | |
I128 | 2^24 = 16777216 | 0 | 321.651 | 353.956 | 110.04% | |
I128 | 2^28 = 268435456 | 0 | 5046 | 5505 | 109.10% | |
F32 | 2^16 = 65536 | 1 | 11.763 | 14.17 | 120.46% | |
F32 | 2^20 = 1048576 | 1 | 18.612 | 20.809 | 111.80% | |
F32 | 2^24 = 16777216 | 1 | 122.412 | 134.803 | 110.12% | |
F32 | 2^28 = 268435456 | 1 | 1747 | 1956 | 111.96% | |
F32 | 2^16 = 65536 | 0.544 | 11.698 | 14.074 | 120.31% | |
F32 | 2^20 = 1048576 | 0.544 | 17.963 | 20.645 | 114.93% | |
F32 | 2^24 = 16777216 | 0.544 | 120.525 | 132.563 | 109.99% | |
F32 | 2^28 = 268435456 | 0.544 | 1722 | 1920 | 111.50% | |
F32 | 2^16 = 65536 | 0 | 11.392 | 14.117 | 123.92% | |
F32 | 2^20 = 1048576 | 0 | 17.548 | 20.579 | 117.27% | |
F32 | 2^24 = 16777216 | 0 | 119.586 | 130.402 | 109.04% | |
F32 | 2^28 = 268435456 | 0 | 1709 | 1895 | 110.88% | |
F64 | 2^16 = 65536 | 1 | 10.566 | 13.528 | 128.03% | |
F64 | 2^20 = 1048576 | 1 | 19.814 | 23.491 | 118.56% | |
F64 | 2^24 = 16777216 | 1 | 169.257 | 195.281 | 115.38% | |
F64 | 2^28 = 268435456 | 1 | 2603 | 2953 | 113.45% | |
F64 | 2^16 = 65536 | 0.544 | 10.609 | 13.316 | 125.52% | |
F64 | 2^20 = 1048576 | 0.544 | 19.794 | 23.263 | 117.53% | |
F64 | 2^24 = 16777216 | 0.544 | 169.404 | 195.539 | 115.43% | |
F64 | 2^28 = 268435456 | 0.544 | 2597 | 2959 | 113.94% | |
F64 | 2^16 = 65536 | 0 | 10.414 | 13.543 | 130.05% | |
F64 | 2^20 = 1048576 | 0 | 19.461 | 23.152 | 118.97% | |
F64 | 2^24 = 16777216 | 0 | 166.736 | 191.613 | 114.92% | |
F64 | 2^28 = 268435456 | 0 | 2572 | 2904 | 112.91% |