From 2e3e8dc32fa6aab78b24e74202cb9a5b0ed8b48d Mon Sep 17 00:00:00 2001
From: Chris Duncan
Date: Wed, 8 Jan 2025 12:35:11 -0800
Subject: [PATCH] Inline G mix 2 of round 0. Interestingly, we can skip some
 instructions this way, since adding zero does not change the mix values,
 but the operation itself is left visible for documentation and possible
 troubleshooting. Also note that PoW values are verified working as of
 this commit.

---
 src/lib/nano-pow/shaders/gpu-compute.ts | 83 ++++++++++++++++++++++++-
 1 file changed, 82 insertions(+), 1 deletion(-)

diff --git a/src/lib/nano-pow/shaders/gpu-compute.ts b/src/lib/nano-pow/shaders/gpu-compute.ts
index 4c30b97..c9e88cc 100644
--- a/src/lib/nano-pow/shaders/gpu-compute.ts
+++ b/src/lib/nano-pow/shaders/gpu-compute.ts
@@ -378,8 +378,89 @@ fn main(
 	/**
 	 * r=0, i=2(x2), a=v[2-3], b=v[10-11], c=v[18-19], d=v[26-27]
+	 */
+
+	// a = a + b
+	o0 = v4 + v12;
+	o1 = v5 + v13;
+	if (v4 > 0xFFFFFFFFu - v12) {
+		o1 = o1 + 1u;
+	}
+	v4 = o0;
+	v5 = o1;
+
+	// a = a + m[sigma[r][2*i+0]]
+	o0 = v4 + m8;
+	o1 = v5 + m9;
+	if (v4 > 0xFFFFFFFFu - m8) {
+		o1 = o1 + 1u;
+	}
+	v4 = o0;
+	v5 = o1;
+
+	// d = rotr64(d ^ a, 32)
+	xor0 = v28 ^ v4;
+	xor1 = v29 ^ v5;
+	v28 = xor1;
+	v29 = xor0;
+
+	// c = c + d
+	o0 = v20 + v28;
+	o1 = v21 + v29;
+	if (v20 > 0xFFFFFFFFu - v28) {
+		o1 = o1 + 1u;
+	}
+	v20 = o0;
+	v21 = o1;
+
+	// b = rotr64(b ^ c, 24)
+	xor0 = v12 ^ v20;
+	xor1 = v13 ^ v21;
+	v12 = (xor0 >> 24u) ^ (xor1 << 8u);
+	v13 = (xor1 >> 24u) ^ (xor0 << 8u);
+
+	// a = a + b
+	o0 = v4 + v12;
+	o1 = v5 + v13;
+	if (v4 > 0xFFFFFFFFu - v12) {
+		o1 = o1 + 1u;
+	}
+	v4 = o0;
+	v5 = o1;
+
+	// a = a + m[sigma[r][2*i+1]]
+	// skip since adding 0u does nothing
+	// o0 = v4 + 0u;
+	// o1 = v5 + 0u;
+	// if (v4 > 0xFFFFFFFFu - 0u) {
+	// 	o1 = o1 + 1u;
+	// }
+	// v4 = o0;
+	// v5 = o1;
+
+	// d = rotr64(d ^ a, 16)
+	xor0 = v28 ^ v4;
+	xor1 = v29 ^ v5;
+	v28 = (xor0 >> 16u) ^ (xor1 << 16u);
+	v29 = (xor1 >> 16u) ^ (xor0 << 16u);
+
+	// c = c + d
+	o0 = v20 + v28;
+	o1 = v21 + v29;
+	if (v20 > 0xFFFFFFFFu - v28) {
+		o1 = o1 + 1u;
+	}
+	v20 = o0;
+	v21 = o1;
+
+	// b = rotr64(b ^ c, 63)
+	xor0 = v12 ^ v20;
+	xor1 = v13 ^ v21;
+	v12 = (xor1 >> 31u) ^ (xor0 << 1u);
+	v13 = (xor0 >> 31u) ^ (xor1 << 1u);
+
+
-	G(&v4, &v5, &v12, &v13, &v20, &v21, &v28, &v29, m8, m9, 0u, 0u);
 	G(&v6, &v7, &v14, &v15, &v22, &v23, &v30, &v31, 0u, 0u, 0u, 0u);
 	G(&v0, &v1, &v10, &v11, &v20, &v21, &v30, &v31, 0u, 0u, 0u, 0u);
 	G(&v2, &v3, &v12, &v13, &v22, &v23, &v24, &v25, 0u, 0u, 0u, 0u);
-- 
2.34.1
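
Editor's note: the inlined sequence above emulates 64-bit BLAKE2b words as
pairs of 32-bit values, since WGSL has no native u64. The test
`v4 > 0xFFFFFFFFu - v12` detects overflow of the low word without widening,
which is also why the zero-message add can be skipped: with an addend of
zero the carry condition can never fire and both halves are left unchanged.
Below is a minimal TypeScript sketch of the same trick, checked against
BigInt reference math; the names add64 and toPair are hypothetical helpers
for illustration, not part of the patch.

	// Hypothetical helper mirroring the shader's add-with-carry on 32-bit halves.
	function add64(aLo: number, aHi: number, bLo: number, bHi: number): [number, number] {
		const o0 = (aLo + bLo) >>> 0;
		let o1 = (aHi + bHi) >>> 0;
		// Carry out of the low word iff aLo > 0xFFFFFFFF - bLo (same test as the shader).
		if (aLo > 0xFFFFFFFF - bLo) o1 = (o1 + 1) >>> 0;
		// When bLo and bHi are both 0, the test can never pass and o0/o1 equal
		// aLo/aHi, which is why the patch skips the add of an all-zero sigma word.
		return [o0, o1];
	}

	// Spot-check against 64-bit BigInt arithmetic.
	const MASK32 = 0xFFFFFFFFn;
	const toPair = (v: bigint): [number, number] =>
		[Number(v & MASK32), Number((v >> 32n) & MASK32)];
	const a = 0xDEADBEEFCAFEF00Dn;
	const b = 0x0123456789ABCDEFn;
	const [lo, hi] = add64(...toPair(a), ...toPair(b));
	const sum = (a + b) & ((MASK32 << 32n) | MASK32);
	console.log(lo === Number(sum & MASK32), hi === Number(sum >> 32n)); // true true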
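
The rotations follow the same half-word pattern: rotr64 by 32 is a free swap
of the two halves (no shifts at all, as in the first rotation above), while
24, 16, and 63 each cross-shift the halves. A sketch of the 24-bit case
under the same assumptions (rotr24x64 is an illustrative name):

	// Hypothetical helper mirroring the shader's rotr64(x, 24) on 32-bit halves.
	function rotr24x64(lo: number, hi: number): [number, number] {
		// Each output half combines bits from both input halves. The shifted
		// fields never overlap, so the shader's XOR is equivalent to OR here.
		return [
			((lo >>> 24) ^ (hi << 8)) >>> 0,
			((hi >>> 24) ^ (lo << 8)) >>> 0,
		];
	}

	// Spot-check against a BigInt 64-bit rotate.
	const M64 = 0xFFFFFFFFFFFFFFFFn;
	const rotr64 = (x: bigint, n: bigint): bigint =>
		((x >> n) | (x << (64n - n))) & M64;
	const x = 0x0123456789ABCDEFn;
	const [rLo, rHi] = rotr24x64(Number(x & 0xFFFFFFFFn), Number(x >> 32n));
	const r = rotr64(x, 24n);
	console.log(rLo === Number(r & 0xFFFFFFFFn), rHi === Number(r >> 32n)); // true true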