From 2cbe375a03962a3d8e32f996ef3534ab919259f8 Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Tue, 7 Jan 2025 06:49:32 -0800 Subject: [PATCH] Inline G mix function addition steps to eliminate a function call. --- src/lib/workers/powgl.ts | 98 +++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 31 deletions(-) diff --git a/src/lib/workers/powgl.ts b/src/lib/workers/powgl.ts index 82f6bf8..1311fb1 100644 --- a/src/lib/workers/powgl.ts +++ b/src/lib/workers/powgl.ts @@ -126,50 +126,86 @@ const uint SIGMA82[192] = uint[192]( 28u,20u,8u,16u,18u,30u,26u,12u,2u,24u,0u,4u,22u,14u,10u,6u ); -// 64-bit unsigned addition within the compression buffer -// Sets v[a,a+1] += b -// b0 is the low 32 bits of b, b1 represents the high 32 bits -void add_uint64 (uint a, uint b0, uint b1) { - uint o0 = v[a] + b0; - uint o1 = v[a+1u] + b1; - if (v[a] > 0xFFFFFFFFu - b0) { // did low 32 bits overflow? - o1++; +// G mixing function +void G (uint ix, uint iy, uint a, uint b, uint c, uint d) { + uint o0; + uint o1; + uint xor0; + uint xor1; + + // a = a + b; + o0 = v[a] + v[b]; + o1 = v[a+1u] + v[b+1u]; + if (v[a] > 0xFFFFFFFFu - v[b]) { + o1 = o1 + 1u; } v[a] = o0; v[a+1u] = o1; -} -// G Mixing function -void B2B_G (uint a, uint b, uint c, uint d, uint ix, uint iy) { - add_uint64(a, v[b], v[b+1u]); - add_uint64(a, m[ix], m[ix+1u]); + // a = a + m[sigma[r][2*i+0]]; + o0 = v[a] + m[ix]; + o1 = v[a+1u] + m[ix+1u]; + if (v[a] > 0xFFFFFFFFu - m[ix]) { + o1 = o1 + 1u; + } + v[a] = o0; + v[a+1u] = o1; - // v[d,d+1] = (v[d,d+1] xor v[a,a+1]) rotated to the right by 32 bits - uint xor0 = v[d] ^ v[a]; - uint xor1 = v[d+1u] ^ v[a+1u]; + // d = rotr64(d ^ a, 32); + xor0 = v[d] ^ v[a]; + xor1 = v[d+1u] ^ v[a+1u]; v[d] = xor1; v[d+1u] = xor0; - add_uint64(c, v[d], v[d+1u]); + // c = c + d; + o0 = v[c] + v[d]; + o1 = v[c+1u] + v[d+1u]; + if (v[c] > 0xFFFFFFFFu - v[d]) { + o1 = o1 + 1u; + } + v[c] = o0; + v[c+1u] = o1; - // v[b,b+1] = (v[b,b+1] xor v[c,c+1]) rotated right by 24 
bits + // b = rotr64(b ^ c, 24); xor0 = v[b] ^ v[c]; xor1 = v[b+1u] ^ v[c+1u]; v[b] = (xor0 >> 24u) ^ (xor1 << 8u); v[b+1u] = (xor1 >> 24u) ^ (xor0 << 8u); - add_uint64(a, v[b], v[b+1u]); - add_uint64(a, m[iy], m[iy+1u]); + // a = a + b; + o0 = v[a] + v[b]; + o1 = v[a+1u] + v[b+1u]; + if (v[a] > 0xFFFFFFFFu - v[b]) { + o1 = o1 + 1u; + } + v[a] = o0; + v[a+1u] = o1; - // v[d,d+1] = (v[d,d+1] xor v[a,a+1]) rotated right by 16 bits + // a = a + m[sigma[r][2*i+1]]; + o0 = v[a] + m[iy]; + o1 = v[a+1u] + m[iy+1u]; + if (v[a] > 0xFFFFFFFFu - m[iy]) { + o1 = o1 + 1u; + } + v[a] = o0; + v[a+1u] = o1; + + // d = rotr64(d ^ a, 16); xor0 = v[d] ^ v[a]; xor1 = v[d+1u] ^ v[a+1u]; v[d] = (xor0 >> 16u) ^ (xor1 << 16u); v[d+1u] = (xor1 >> 16u) ^ (xor0 << 16u); - add_uint64(c, v[d], v[d+1u]); + // c = c + d; + o0 = v[c] + v[d]; + o1 = v[c+1u] + v[d+1u]; + if (v[c] > 0xFFFFFFFFu - v[d]) { + o1 = o1 + 1u; + } + v[c] = o0; + v[c+1u] = o1; - // v[b,b+1] = (v[b,b+1] xor v[c,c+1]) rotated right by 63 bits + // b = rotr64(b ^ c, 63); xor0 = v[b] ^ v[c]; xor1 = v[b+1u] ^ v[c+1u]; v[b] = (xor1 >> 31u) ^ (xor0 << 1u); @@ -202,14 +238,14 @@ void main() { // twelve rounds of mixing for(uint i = 0u; i < 12u; i = i + 1u) { - B2B_G(0u, 8u, 16u, 24u, SIGMA82[i * 16u + 0u], SIGMA82[i * 16u + 1u]); - B2B_G(2u, 10u, 18u, 26u, SIGMA82[i * 16u + 2u], SIGMA82[i * 16u + 3u]); - B2B_G(4u, 12u, 20u, 28u, SIGMA82[i * 16u + 4u], SIGMA82[i * 16u + 5u]); - B2B_G(6u, 14u, 22u, 30u, SIGMA82[i * 16u + 6u], SIGMA82[i * 16u + 7u]); - B2B_G(0u, 10u, 20u, 30u, SIGMA82[i * 16u + 8u], SIGMA82[i * 16u + 9u]); - B2B_G(2u, 12u, 22u, 24u, SIGMA82[i * 16u + 10u], SIGMA82[i * 16u + 11u]); - B2B_G(4u, 14u, 16u, 26u, SIGMA82[i * 16u + 12u], SIGMA82[i * 16u + 13u]); - B2B_G(6u, 8u, 18u, 28u, SIGMA82[i * 16u + 14u], SIGMA82[i * 16u + 15u]); + G(SIGMA82[i * 16u + 0u], SIGMA82[i * 16u + 1u], 0u, 8u, 16u, 24u); + G(SIGMA82[i * 16u + 2u], SIGMA82[i * 16u + 3u], 2u, 10u, 18u, 26u); + G(SIGMA82[i * 16u + 4u], SIGMA82[i * 16u + 5u], 4u, 
12u, 20u, 28u); + G(SIGMA82[i * 16u + 6u], SIGMA82[i * 16u + 7u], 6u, 14u, 22u, 30u); + G(SIGMA82[i * 16u + 8u], SIGMA82[i * 16u + 9u], 0u, 10u, 20u, 30u); + G(SIGMA82[i * 16u + 10u], SIGMA82[i * 16u + 11u], 2u, 12u, 22u, 24u); + G(SIGMA82[i * 16u + 12u], SIGMA82[i * 16u + 13u], 4u, 14u, 16u, 26u); + G(SIGMA82[i * 16u + 14u], SIGMA82[i * 16u + 15u], 6u, 8u, 18u, 28u); } // Pixel data is multipled by threshold test result (0 or 1) -- 2.34.1