From: Chris Duncan
Date: Wed, 5 Feb 2025 05:01:41 +0000 (-0800)
Subject: Benchmarking shows vec4 version actually ended up being slower, probably due to incre...
X-Git-Tag: v3.0.0~29
X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=98db460d1d13a3580e8e1dc8c6c55f8d629bcc1f;p=nano-pow.git

Benchmarking shows the vec4 version actually ended up being slower, probably due to increased overhead and register pressure. Revert to the vec2 implementation.
---

diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl
index f6a11e5..b021618 100644
--- a/src/shaders/compute.wgsl
+++ b/src/shaders/compute.wgsl
@@ -20,15 +20,21 @@ struct WORK {
 };
 @group(0) @binding(1) var work: WORK;
 
+/**
+*
+* The numeric literal used in the finalization digest is the original value of
+* the first element of the initialization vector: `blake2b_IV[0]`.
+*/
+const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u);
+
 /**
 * Used to rotate bits by a fixed amount during G mixing.
 */
-const Z = vec2(0u);
-const ROTATE_1 = vec4(1u);
-const ROTATE_8 = vec4(8u);
-const ROTATE_16 = vec4(16u);
-const ROTATE_24 = vec4(24u);
-const ROTATE_31 = vec4(31u);
+const ROTATE_1 = vec2(1u);
+const ROTATE_8 = vec2(8u);
+const ROTATE_16 = vec2(16u);
+const ROTATE_24 = vec2(24u);
+const ROTATE_31 = vec2(31u);
 
 /**
 * Shared flag to prevent execution for all workgroup threads based on the
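[Editorial note on the representation used throughout this shader: each 64-bit BLAKE2b word is stored as a vec2 with the low 32 bits in .x and the high 32 bits in .y. Under that layout, BLAKE2B_IV_0 is the standard parameter-block initialization of h[0]: assuming the unkeyed, 8-byte-digest configuration used for Nano proof-of-work, IV[0] = 0x6A09E667F3BCC908 is XORed with 0x01010008, and 0xF3BCC908 ^ 0x01010008 == 0xF2BDC900. The recurring `vec2(0u, u32(a.x + b.x < a.x))` term in the hunks below is the carry of a 64-bit add in this layout. A minimal sketch, with `add64` as an illustrative helper name rather than anything in the shader:

fn add64(a: vec2<u32>, b: vec2<u32>) -> vec2<u32> {
	let lo = a.x + b.x;         // 32-bit add, wraps modulo 2^32
	let carry = u32(lo < a.x);  // 1 only if the low half wrapped
	return vec2(lo, a.y + b.y + carry);
}
]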
@@ -99,38 +105,36 @@ fn main(id: vec3) {
 * It is always the "last" compression at this INLEN
 * v14 = ~v14;
 */
- var v01: vec4 = vec4(0xF2BDC900u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u);
- var v23: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
- var v45: vec4 = vec4(0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
- var v67: vec4 = vec4(0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u);
- var v89: vec4 = vec4(0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u);
- var vAB: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
- var vCD: vec4 = vec4(0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
- var vEF: vec4 = vec4(0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u);
-
- /**
- * Temporary variables used for subprocesses i=4 through i=7
- */
- var v56: vec4;
- var vFC: vec4;
- var v74: vec4;
- var vDE: vec4;
-
- /**
- * Twelve rounds of G mixing as part of BLAKE2b compression step. Normally,
- * each round is divided into eight subprocesses; NanoPow compresses these
- * operations into four subprocesses by executing sequential pairs
- * simultaneously, inspired by https://github.com/minio/blake2b-simd. It then
- * executes each compressed statement in pairs so that the compiler can
- * interleave independent instructions and improve scheduling. That is to say,
- * to execute `a = a + b` for subprocesses 1-4, first 1 is paired with 2 and 3
- * is paired with 4; then 1/2 is executed and 3/4 is executed; then the next
- * computation `a = a + m[sigma[r][2*i+0]]` is executed in the same manner, and
- * so on through all the steps of the subprocess.
+ var v0: vec2 = BLAKE2B_IV_0;
+ var v1: vec2 = vec2(0x84CAA73Bu, 0xBB67AE85u);
+ var v2: vec2 = vec2(0xFE94F82Bu, 0x3C6EF372u);
+ var v3: vec2 = vec2(0x5F1D36F1u, 0xA54FF53Au);
+ var v4: vec2 = vec2(0xADE682D1u, 0x510E527Fu);
+ var v5: vec2 = vec2(0x2B3E6C1Fu, 0x9B05688Cu);
+ var v6: vec2 = vec2(0xFB41BD6Bu, 0x1F83D9ABu);
+ var v7: vec2 = vec2(0x137E2179u, 0x5BE0CD19u);
+ var v8: vec2 = vec2(0xF3BCC908u, 0x6A09E667u);
+ var v9: vec2 = vec2(0x84CAA73Bu, 0xBB67AE85u);
+ var v10: vec2 = vec2(0xFE94F82Bu, 0x3C6EF372u);
+ var v11: vec2 = vec2(0x5F1D36F1u, 0xA54FF53Au);
+ var v12: vec2 = vec2(0xADE682F9u, 0x510E527Fu);
+ var v13: vec2 = vec2(0x2B3E6C1Fu, 0x9B05688Cu);
+ var v14: vec2 = vec2(0x04BE4294u, 0xE07C2654u);
+ var v15: vec2 = vec2(0x137E2179u, 0x5BE0CD19u);
+
+ /**
+ * Twelve rounds of G mixing as part of the BLAKE2b compression step, each
+ * divided into eight subprocesses. Each statement of the first four
+ * subprocesses is executed in sequence so that the compiler can interleave
+ * independent instructions for improved scheduling. That is to say, the first
+ * statement `a = a + b` is executed for subprocesses 1-4, and then the next
+ * statement `a = a + m[sigma[r][2*i+0]]` is executed, and so on through all
+ * the steps of the mixing function. Once subprocesses 1-4 are done,
+ * subprocesses 5-8 are executed in the same manner.
 *
- * Each subprocess applies transformations to to `m` and `v` variables based on
- * a defined set of index inputs. The algorithm for each subprocess is defined
- * as follows:
+ * Each subprocess applies transformations to `m` and `v` variables based on a
+ * defined set of index inputs. The algorithm for each subprocess is defined as
+ * follows:
 *
 * r is the current round
 * i is the current subprocess within that round
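[Editorial note: every hunk below instantiates this same mixing function on another column or diagonal of the state. For reference, one full G application on (a, b, c, d) = (v0, v4, v8, v12), restated compactly with the illustrative `add64` helper sketched earlier; the BLAKE2b rotation amounts 32, 24, 16 and 63 each become a half swap plus paired 32-bit shifts on a (lo, hi) vec2:

v0 = add64(add64(v0, v4), mx);                                     // mx = m[sigma[r][2*i+0]]
v12 = (v12 ^ v0).yx;                                               // rotate right 32: swap halves
v8 = add64(v8, v12);
v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);     // rotate right 24
v0 = add64(add64(v0, v4), my);                                     // my = m[sigma[r][2*i+1]]
v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); // rotate right 16
v8 = add64(v8, v12);
v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);     // rotate right 63 = rotate left 1

When sigma selects a message word that is zero for this input, the corresponding add is omitted entirely, which is why the blocks below vary slightly from subprocess to subprocess.]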
@@ -158,86 +162,106 @@ fn main(id: vec3) {
 ****************************************************************************/
 /**
- * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=0, m[sigma+1]=1
- * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=2, m[sigma+1]=3
- * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5
- * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7
+ * r=0, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
 */
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(m0, m2) + vec4(0u, u32(v01.x + m0.x < v01.x), 0u, u32(v01.z + m2.x < v01.z));
- v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + m4.x < v23.x), Z);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
- v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
-
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(m1, m3) + vec4(0u, u32(v01.x + m1.x < v01.x), 0u, u32(v01.z + m3.x < v01.z));
- // NOP
-
- vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
- vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
- v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
+ v12 = v12.yx ^ v0.yx;
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
+ v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16);
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);

 /**
- * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9
- * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=10, m[sigma+1]=11
- * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13
- * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15
+ * r=0, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
 */
- v56 = vec4(v45.zw, v67.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- v74 = vec4(v67.zw, v45.xy);
- vDE = vec4(vCD.zw, vEF.xy);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x));
+ v13 = v13.yx ^ v1.yx;
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
+ v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16);
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);

- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
-
- // NOP
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
-
- v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
- v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
+ /**
+ * r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
+ */
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
+ v14 = v14.yx ^ v2.yx;
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8);
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16);
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31);

- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ /**
+ * r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
+ */
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = v15.yx ^ v3.yx;
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8);
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16);
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31);

- // NOP
- // NOP
+ /**
+ * r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
+ */
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = v15.yx ^ v0.yx;
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8);
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16);
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31);

- vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
- vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
+ /**
+ * r=0, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
+ */
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = v12.yx ^ v1.yx;
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8);
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16);
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31);

- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ /**
+ * r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
+ */
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = v13.yx ^ v2.yx;
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16);
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
- v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+ /**
+ * r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ */
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = v14.yx ^ v3.yx;
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16);
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);

- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
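[Editorial note: the sparseness of the message adds falls out of the input size. Nano proof-of-work hashes a 40-byte message (assuming the usual 8-byte nonce followed by the 32-byte block root), so only the five 64-bit words m0 through m4 can be nonzero and m5 through m15 drop out of the schedule; the removed vec4 code marked those skipped adds with `// NOP`. Round 0 uses the identity permutation, so pairing the sigma[0] entries per subprocess gives

(0,1)  (2,3)  (4,5)  (6,7)  (8,9)  (10,11)  (12,13)  (14,15)
 i=0    i=1    i=2    i=3    i=4     i=5      i=6      i=7

hence i=0 adds m0 and m1, i=1 adds m2 and m3, i=2 adds only m4, and i=3 through i=7 add no message words at all, exactly as in the hunk above.]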
@@ -247,86 +271,105 @@ fn main(id: vec3) {
 ****************************************************************************/
 /**
- * r=1, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=14, m[sigma+1]=10
- * r=1, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=4, m[sigma+1]=8
- * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15
- * r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6
+ * r=1, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
 */
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(Z, m4) + vec4(Z, 0u, u32(v01.z + m4.x < v01.z));
- // NOP
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
- v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
-
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- // NOP
- // NOP
-
- vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
- vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
- v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = v12.yx ^ v0.yx;
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16);
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);

 /**
- * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12
- * r=1, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=0, m[sigma+1]=2
- * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7
- * r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3
+ * r=1, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
 */
- v56 = vec4(v45.zw, v67.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- v74 = vec4(v67.zw, v45.xy);
- vDE = vec4(vCD.zw, vEF.xy);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x));
+ v13 = v13.yx ^ v1.yx;
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16);
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);

- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
-
- v01 += vec4(m1, m0) + vec4(0u, u32(v01.x + m1.x < v01.x), 0u, u32(v01.z + m0.x < v01.z));
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
-
- v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
- v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ /**
+ * r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
+ */
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = v14.yx ^ v2.yx;
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8);
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16);
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31);

- v01 += vec4(Z, m2) + vec4(Z, 0u, u32(v01.z + m2.x < v01.z));
- v23 += vec4(Z, m3) + vec4(Z, 0u, u32(v23.z + m3.x < v23.z));
+ /**
+ * r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
+ */
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = v15.yx ^ v3.yx;
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8);
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16);
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31);

- vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
- vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
+ /**
+ * r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
+ */
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
+ v15 = v15.yx ^ v0.yx;
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8);
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16);
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31);

- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ /**
+ * r=1, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
+ */
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x));
+ v12 = v12.yx ^ v1.yx;
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8);
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x));
+ v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16);
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
- v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+ /**
+ * r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
+ */
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = v13.yx ^ v2.yx;
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16);
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);

- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
+ /**
+ * r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ */
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = v14.yx ^ v3.yx;
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x));
+ v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16);
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
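[Editorial note: the same bookkeeping can be checked against round 1, since the new comments no longer spell out the schedule. A hypothetical reference table, not present in the shader:

// BLAKE2b message schedule for round 1; with a 40-byte input,
// entries 5-15 select words that are always zero.
const SIGMA_1 = array<u32, 16>(14u, 10u, 4u, 8u, 9u, 15u, 13u, 6u, 1u, 12u, 0u, 2u, 11u, 7u, 5u, 3u);

Pairing the entries per subprocess gives (14,10), (4,8), (9,15), (13,6), (1,12), (0,2), (11,7), (5,3), so only i=1 (m4 first), i=4 (m1 first), i=5 (m0 then m2) and i=7 (m3 second) touch nonzero words — which is exactly where the message adds appear in the round 1 hunk above.]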
@@ -337,86 +380,105 @@ fn main(id: vec3) {
 ****************************************************************************/
 /**
- * r=2, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=11, m[sigma+1]=8
- * r=2, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=12, m[sigma+1]=0
- * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=5, m[sigma+1]=2
- * r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=15, m[sigma+1]=13
+ * r=2, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
 */
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- // NOP
- // NOP
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
- v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
-
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(Z, m0) + vec4(Z, 0u, u32(v01.z + m0.x < v01.z));
- v23 += vec4(m2, Z) + vec4(0u, u32(v23.x + m2.x < v23.x), Z);
-
- vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
- vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
- v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = v12.yx ^ v0.yx;
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16);
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);

 /**
- * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=10, m[sigma+1]=14
- * r=2, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=3, m[sigma+1]=6
- * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=7, m[sigma+1]=1
- * r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=9, m[sigma+1]=4
+ * r=2, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
 */
- v56 = vec4(v45.zw, v67.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- v74 = vec4(v67.zw, v45.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
-
- v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + m3.x < v01.z));
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v13 = v13.yx ^ v1.yx;
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v1 = v1 + m0 + vec2(0u, u32(v1.x + m0.x < v1.x));
+ v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16);
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
- v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ /**
+ * r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
+ */
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = v14.yx ^ v2.yx;
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8);
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x));
+ v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16);
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31);

- // NOP
- v23 += vec4(m1, m4) + vec4(0u, u32(v23.x + m1.x < v23.x), 0u, u32(v23.z + m4.x < v23.z));
+ /**
+ * r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
+ */
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = v15.yx ^ v3.yx;
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8);
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16);
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31);

- vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
- vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
+ /**
+ * r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
+ */
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = v15.yx ^ v0.yx;
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8);
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16);
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31);

- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ /**
+ * r=2, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
+ */
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
+ v12 = v12.yx ^ v1.yx;
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8);
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16);
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
- v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+ /**
+ * r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
+ */
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = v13.yx ^ v2.yx;
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x));
+ v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16);
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);

- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
+ /**
+ * r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ */
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = v14.yx ^ v3.yx;
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x));
+ v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16);
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
@@ -427,86 +489,105 @@ fn main(id: vec3) {
 ****************************************************************************/
 /**
- * r=3, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=7, m[sigma+1]=9
- * r=3, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=3, m[sigma+1]=1
- * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=13, m[sigma+1]=12
- * r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=11, m[sigma+1]=14
+ * r=3, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
 */
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + m3.x < v01.z));
- // NOP
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
- v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
-
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + m1.x < v01.z));
- // NOP
-
- vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
- vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
- v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = v12.yx ^ v0.yx;
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16);
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);

 /**
- * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=2, m[sigma+1]=6
- * r=3, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=5, m[sigma+1]=10
- * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=4, m[sigma+1]=0
- * r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=15, m[sigma+1]=8
+ * r=3, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
 */
- v56 = vec4(v45.zw, v67.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- v74 = vec4(v67.zw, v45.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
-
- v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + m2.x < v01.x), Z);
- v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + m4.x < v23.x), Z);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
+ v13 = v13.yx ^ v1.yx;
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x));
+ v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16);
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);

- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
-
- v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
- v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ /**
+ * r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
+ */
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = v14.yx ^ v2.yx;
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8);
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16);
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31);

- // NOP
- v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + m0.x < v23.x), Z);
+ /**
+ * r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
+ */
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = v15.yx ^ v3.yx;
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8);
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16);
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31);

- vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
- vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
+ /**
+ * r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
+ */
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x));
+ v15 = v15.yx ^ v0.yx;
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8);
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16);
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31);

- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ /**
+ * r=3, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
+ */
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = v12.yx ^ v1.yx;
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8);
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16);
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
- v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+ /**
+ * r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
+ */
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
+ v13 = v13.yx ^ v2.yx;
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x));
+ v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16);
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);

- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
+ /**
+ * r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ */
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = v14.yx ^ v3.yx;
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16);
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
@@ -517,89 +598,105 @@ fn main(id: vec3) {
 ****************************************************************************/
 /**
- * r=4, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=9, m[sigma+1]=0
- * r=4, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=5, m[sigma+1]=7
- * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=2, m[sigma+1]=4
- * r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=10, m[sigma+1]=15
+ * r=4, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
 */
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- // NOP
- v23 += vec4(m2, Z) + vec4(0u, u32(v23.x + m2.x < v23.x), Z);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
- v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
-
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + m0.x < v01.x), Z);
- v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + m4.x < v23.x), Z);
-
- vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
- vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
- v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = v12.yx ^ v0.yx;
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
+ v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16);
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);

 /**
 */
+ /**
+ * r=4, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
+ */
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v13 = v13.yx ^ v1.yx;
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16);
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);

 /**
- * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=14, m[sigma+1]=1
- * r=4, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=11, m[sigma+1]=12
- * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=6, m[sigma+1]=8
- * r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=3, m[sigma+1]=13
+ * r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
 */
- v56 = vec4(v45.zw, v67.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- v74 = vec4(v67.zw, v45.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x));
+ v14 = v14.yx ^ v2.yx;
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8);
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x));
+ v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16);
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31);

- // NOP
- v23 += vec4(Z, m3) + vec4(Z, 0u, u32(v23.z + m3.x < v23.z));
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
-
- v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
- v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ /**
+ * r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
+ */
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = v15.yx ^ v3.yx;
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8);
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16);
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31);

- v01 += vec4(m1, Z) + vec4(0u, u32(v01.x + m1.x < v01.x), Z);
- // NOP
+ /**
+ * r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
+ */
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = v15.yx ^ v0.yx;
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8);
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x));
+ v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16);
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31);

- vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
- vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ /**
+ * r=4, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
+ */
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = v12.yx ^ v1.yx;
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8);
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16);
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
- v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+ /**
+ * r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
+ */
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = v13.yx ^ v2.yx;
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16);
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);

- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
+ /**
+ * r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ */
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x));
+ v14 = v14.yx ^ v3.yx;
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16);
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
@@ -610,86 +707,105 @@ fn main(id: vec3) {
 ****************************************************************************/
 /**
- * r=5, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=2, m[sigma+1]=12
- * r=5, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=6, m[sigma+1]=10
- * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=0, m[sigma+1]=11
- * r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=8, m[sigma+1]=3
+ * r=5, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
 */
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + m2.x < v01.x), Z);
- v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + m0.x < v23.x), Z);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
- v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
-
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- // NOP
- v23 += vec4(Z, m3) + vec4(Z, 0u, u32(v23.z + m3.x < v23.z));
-
- vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
- vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
- v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x));
+ v12 = v12.yx ^ v0.yx;
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16);
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);

 /**
- * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=4, m[sigma+1]=13
- * r=5, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=7, m[sigma+1]=5
- * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=15, m[sigma+1]=14
- * r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=1, m[sigma+1]=9
+ * r=5, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
 */
- v56 = vec4(v45.zw, v67.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- v74 = vec4(v67.zw, v45.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v13 = v13.yx ^ v1.yx;
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16);
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);

- v01 += vec4(m4, Z) + vec4(0u, u32(v01.x + m4.x < v01.x), Z);
- v23 += vec4(Z, m1) + vec4(Z, 0u, u32(v23.z + m1.x < v23.z));
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
-
- v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
- v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ /**
+ * r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
+ */
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v2 = v2 + m0 + vec2(0u, u32(v2.x + m0.x < v2.x));
+ v14 = v14.yx ^ v2.yx;
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8);
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16);
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31);

- // NOP
- // NOP
+ /**
+ * r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
+ */
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = v15.yx ^ v3.yx;
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8);
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x));
+ v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16);
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31);

- vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
- vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
+ /**
+ * r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
+ */
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v0 = v0 + m4 + vec2(0u, u32(v0.x + m4.x < v0.x));
+ v15 = v15.yx ^ v0.yx;
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8);
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16);
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31);

- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ /**
+ * r=5, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
+ */
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = v12.yx ^ v1.yx;
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8);
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16);
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
- v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+ /**
+ * r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
+ */
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = v13.yx ^ v2.yx;
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16);
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);

- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
+ /**
+ * r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ */
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x));
+ v14 = v14.yx ^ v3.yx;
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16);
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
@@ -700,86 +816,105 @@ fn main(id: vec3) {
 ****************************************************************************/
 /**
- * r=6, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=12, m[sigma+1]=5
- * r=6, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=1, m[sigma+1]=15
- * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=14, m[sigma+1]=13
- * r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=4, m[sigma+1]=10
+ * r=6, i=0, a=v[0], b=v[4], c=v[8], d=v[12]
 */
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + m1.x < v01.z));
- v23 += vec4(Z, m4) + vec4(Z, 0u, u32(v23.z + m4.x < v23.z));
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8);
- v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8);
-
- v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z));
- v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z));
-
- // NOP
- // NOP
-
- vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16);
- vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16);
-
- v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z));
- vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z));
-
- v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31);
- v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = v12.yx ^ v0.yx;
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8);
+ v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x));
+ v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16);
+ v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x));
+ v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31);

 /**
- * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=0, m[sigma+1]=7
- * r=6, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=6, m[sigma+1]=3
- * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=9, m[sigma+1]=2
- * r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=8, m[sigma+1]=11
+ * r=6, i=1, a=v[1], b=v[5], c=v[9], d=v[13]
 */
- v56 = vec4(v45.zw, v67.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- v74 = vec4(v67.zw, v45.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
-
- v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + m0.x < v01.x), Z);
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v1 = v1 + m1 + vec2(0u, u32(v1.x + m1.x < v1.x));
+ v13 = v13.yx ^ v1.yx;
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8);
+ v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x));
+ v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16);
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x));
+ v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8);
- v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8);
-
- v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z));
- v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z));
+ /**
+ * r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14]
+ */
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = v14.yx ^ v2.yx;
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8);
+ v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x));
+ v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16);
+ v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x));
+ v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31);

- v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + m3.x < v01.z));
- v23 += vec4(m2, Z) + vec4(0u, u32(v23.x + m2.x < v23.x), Z);
+ /**
+ * r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15]
+ */
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v3 = v3 + m4 + vec2(0u, u32(v3.x + m4.x < v3.x));
+ v15 = v15.yx ^ v3.yx;
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8);
+ v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x));
+ v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16);
+ v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x));
+ v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31);

- vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16);
- vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16);
+ /**
+ * r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15]
+ */
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x));
+ v15 = v15.yx ^ v0.yx;
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8);
+ v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x));
+ v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16);
+ v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x));
+ v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31);

- vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z));
- v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z));
+ /**
+ * r=6, i=5, a=v[1], b=v[6], c=v[11], d=v[12]
+ */
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v12 = v12.yx ^ v1.yx;
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8);
+ v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x));
+ v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x));
+ v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16);
+ v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x));
+ v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31);

- v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31);
- v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31);
+ /**
+ * r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13]
+ */
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v13 = v13.yx ^ v2.yx;
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8);
+ v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x));
+ v2 = v2 + m2 + vec2(0u, u32(v2.x + m2.x < v2.x));
+ v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16);
+ v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x));
+ v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31);

- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
+ /**
+ * r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14]
+ */
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = v14.yx ^ v3.yx;
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8);
+ v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x));
+ v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16);
+ v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x));
+ v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31);
m[sigma]=13, m[sigma+1]=11 - * r=7, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=7, m[sigma+1]=14 - * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=12, m[sigma+1]=1 - * r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=3, m[sigma+1]=9 + * r=7, i=0, a=v[0], b=v[4], c=v[8], d=v[12] */ - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - // NOP - v23 += vec4(Z, m3) + vec4(Z, 0u, u32(v23.z + m3.x < v23.z)); - - vCD = (vCD ^ v01).yxwz; - vEF = (vEF ^ v23).yxwz; - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); - v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8); - - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - // NOP - v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + m1.x < v23.x), Z); - - vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16); - vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16); - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); - v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v12 = v12.yx ^ v0.yx; + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); /** - * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=5, m[sigma+1]=0 - * r=7, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=15, m[sigma+1]=4 - * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=8, m[sigma+1]=6 - * r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=2, m[sigma+1]=10 + * r=7, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v56 = vec4(v45.zw, v67.xy); - vFC = vec4(vEF.zw, vCD.xy); - v74 = vec4(v67.zw, v45.xy); - vDE = vec4(vCD.zw, vEF.xy); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); - - // NOP - v23 += vec4(Z, m2) + vec4(Z, 0u, u32(v23.z + m2.x < v23.z)); - - vFC = (vFC ^ v01).yxwz; - vDE = (vDE ^ v23).yxwz; + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v13 = v13.yx ^ v1.yx; + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); - - v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8); - v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8); - - 
v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); + /** + * r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14] + */ + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v14 = v14.yx ^ v2.yx; + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); + v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); - v01 += vec4(m0, m4) + vec4(0u, u32(v01.x + m0.x < v01.x), 0u, u32(v01.z + m4.x < v01.z)); - // NOP + /** + * r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15] + */ + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v3 = v3 + m3 + vec2(0u, u32(v3.x + m3.x < v3.x)); + v15 = v15.yx ^ v3.yx; + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); - vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16); - vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16); + /** + * r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15] + */ + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v15 = v15.yx ^ v0.yx; + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); + v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); + /** + * r=7, i=5, a=v[1], b=v[6], c=v[11], d=v[12] + */ + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v12 = v12.yx ^ v1.yx; + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); + v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); - v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31); - v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31); + /** + * r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13] + */ + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v13 = v13.yx ^ v2.yx; + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); - v45 = vec4(v74.zw, v56.xy); - v67 = vec4(v56.zw, v74.xy); - vCD = vec4(vFC.zw, vDE.xy); - vEF = vec4(vDE.zw, 
vFC.xy); + /** + * r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + */ + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v3 = v3 + m2 + vec2(0u, u32(v3.x + m2.x < v3.x)); + v14 = v14.yx ^ v3.yx; + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); @@ -880,86 +1034,105 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=8, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=6, m[sigma+1]=15 - * r=8, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=14, m[sigma+1]=9 - * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=11, m[sigma+1]=3 - * r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=0, m[sigma+1]=8 + * r=8, i=0, a=v[0], b=v[4], c=v[8], d=v[12] */ - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - // NOP - v23 += vec4(Z, m0) + vec4(Z, 0u, u32(v23.z + m0.x < v23.z)); - - vCD = (vCD ^ v01).yxwz; - vEF = (vEF ^ v23).yxwz; - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); - v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8); - - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - // NOP - v23 += vec4(m3, Z) + vec4(0u, u32(v23.x + m3.x < v23.x), Z); - - vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16); - vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16); - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); - v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v12 = v12.yx ^ v0.yx; + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); /** - * r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=12, m[sigma+1]=2 - * r=8, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=13, m[sigma+1]=7 - * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=1, m[sigma+1]=4 - * r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=10, m[sigma+1]=5 + * r=8, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v56 = vec4(v45.zw, v67.xy); - vFC = vec4(vEF.zw, vCD.xy); - v74 = vec4(v67.zw, v45.xy); - vDE = vec4(vCD.zw, vEF.xy); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); - - // NOP - v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + m1.x < v23.x), Z); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v13 = v13.yx ^ v1.yx; 
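+ // Note: the swizzled XOR above is equivalent to `(v13 ^ v1).yx`, i.e. the
+ // G-mix step `d = rotr64(d ^ a, 32)`; rotating a 64-bit value by 32 bits
+ // simply swaps its two 32-bit words, so no shift instructions are needed.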
+ v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); - vFC = (vFC ^ v01).yxwz; - vDE = (vDE ^ v23).yxwz; - - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); - - v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8); - v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); + /** + * r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14] + */ + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v14 = v14.yx ^ v2.yx; + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x)); + v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); - v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + m2.x < v01.x), Z); - v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + m4.x < v23.x), Z); + /** + * r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15] + */ + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v3 = v3 + m0 + vec2(0u, u32(v3.x + m0.x < v3.x)); + v15 = v15.yx ^ v3.yx; + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); - vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16); - vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16); + /** + * r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15] + */ + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v15 = v15.yx ^ v0.yx; + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); + v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); + /** + * r=8, i=5, a=v[1], b=v[6], c=v[11], d=v[12] + */ + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v12 = v12.yx ^ v1.yx; + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); - v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ 
vAB).yxwz >> ROTATE_31); - v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31); + /** + * r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13] + */ + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v2 = v2 + m1 + vec2(0u, u32(v2.x + m1.x < v2.x)); + v13 = v13.yx ^ v2.yx; + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); + v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); - v45 = vec4(v74.zw, v56.xy); - v67 = vec4(v56.zw, v74.xy); - vCD = vec4(vFC.zw, vDE.xy); - vEF = vec4(vDE.zw, vFC.xy); + /** + * r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + */ + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v14 = v14.yx ^ v3.yx; + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); @@ -970,86 +1143,105 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=9, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=10, m[sigma+1]=2 - * r=9, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=8, m[sigma+1]=4 - * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=7, m[sigma+1]=6 - * r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=1, m[sigma+1]=5 + * r=9, i=0, a=v[0], b=v[4], c=v[8], d=v[12] */ - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - // NOP - v23 += vec4(Z, m1) + vec4(Z, 0u, u32(v23.z + m1.x < v23.z)); - - vCD = (vCD ^ v01).yxwz; - vEF = (vEF ^ v23).yxwz; - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); - v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8); - - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - v01 += vec4(m2, m4) + vec4(0u, u32(v01.x + m2.x < v01.x), 0u, u32(v01.z + m4.x < v01.z)); - // NOP - - vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16); - vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16); - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); - v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v12 = v12.yx ^ v0.yx; + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v0 = v0 + m2 + vec2(0u, u32(v0.x + m2.x < v0.x)); + v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy << 
ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); /** - * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=15, m[sigma+1]=11 - * r=9, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=9, m[sigma+1]=14 - * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=3, m[sigma+1]=12 - * r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=13, m[sigma+1]=0 + * r=9, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v56 = vec4(v45.zw, v67.xy); - vFC = vec4(vEF.zw, vCD.xy); - v74 = vec4(v67.zw, v45.xy); - vDE = vec4(vCD.zw, vEF.xy); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v13 = v13.yx ^ v1.yx; + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); + v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); - - // NOP - v23 += vec4(m3, Z) + vec4(0u, u32(v23.x + m3.x < v23.x), Z); - - vFC = (vFC ^ v01).yxwz; - vDE = (vDE ^ v23).yxwz; - - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); - - v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8); - v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); + /** + * r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14] + */ + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v14 = v14.yx ^ v2.yx; + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); - // NOP - v23 += vec4(Z, m0) + vec4(Z, 0u, u32(v23.z + m0.x < v23.z)); + /** + * r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15] + */ + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v3 = v3 + m1 + vec2(0u, u32(v3.x + m1.x < v3.x)); + v15 = v15.yx ^ v3.yx; + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); - vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16); - vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16); + /** + * r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15] + */ + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v15 = v15.yx ^ v0.yx; + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> 
ROTATE_31); - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); + /** + * r=9, i=5, a=v[1], b=v[6], c=v[11], d=v[12] + */ + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v12 = v12.yx ^ v1.yx; + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); - v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31); - v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31); + /** + * r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13] + */ + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v2 = v2 + m3 + vec2(0u, u32(v2.x + m3.x < v2.x)); + v13 = v13.yx ^ v2.yx; + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); - v45 = vec4(v74.zw, v56.xy); - v67 = vec4(v56.zw, v74.xy); - vCD = vec4(vFC.zw, vDE.xy); - vEF = vec4(vDE.zw, vFC.xy); + /** + * r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + */ + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v14 = v14.yx ^ v3.yx; + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v3 = v3 + m0 + vec2(0u, u32(v3.x + m0.x < v3.x)); + v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); @@ -1060,86 +1252,105 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=10, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=0, m[sigma+1]=1 - * r=10, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=2, m[sigma+1]=3 - * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5 - * r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7 + * r=10, i=0, a=v[0], b=v[4], c=v[8], d=v[12] */ - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - v01 += vec4(m0, m2) + vec4(0u, u32(v01.x + m0.x < v01.x), 0u, u32(v01.z + m2.x < v01.z)); - v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + m4.x < v23.x), Z); - - vCD = (vCD ^ v01).yxwz; - vEF = (vEF ^ v23).yxwz; - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); - v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8); - - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - v01 += vec4(m1, m3) + vec4(0u, u32(v01.x + m1.x < v01.x), 0u, u32(v01.z + m3.x < v01.z)); - // NOP - - vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16); - vEF = 
((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16); - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); - v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v0 = v0 + m0 + vec2(0u, u32(v0.x + m0.x < v0.x)); + v12 = v12.yx ^ v0.yx; + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); + v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy << ROTATE_1) | ((v4 ^ v8).yx >> ROTATE_31); /** - * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9 - * r=10, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=10, m[sigma+1]=11 - * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13 - * r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15 + * r=10, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v56 = vec4(v45.zw, v67.xy); - vFC = vec4(vEF.zw, vCD.xy); - v74 = vec4(v67.zw, v45.xy); - vDE = vec4(vCD.zw, vEF.xy); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); - - // NOP - // NOP - - vFC = (vFC ^ v01).yxwz; - vDE = (vDE ^ v23).yxwz; - - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v1 = v1 + m2 + vec2(0u, u32(v1.x + m2.x < v1.x)); + v13 = v13.yx ^ v1.yx; + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v1 = v1 + m3 + vec2(0u, u32(v1.x + m3.x < v1.x)); + v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); - v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8); - v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); + /** + * r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14] + */ + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v2 = v2 + m4 + vec2(0u, u32(v2.x + m4.x < v2.x)); + v14 = v14.yx ^ v2.yx; + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy << ROTATE_1) | ((v6 ^ v10).yx >> ROTATE_31); - // NOP - // NOP + /** + * r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15] + */ + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v15 = v15.yx ^ v3.yx; + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v15 = 
((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); - vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16); - vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16); + /** + * r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15] + */ + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v15 = v15.yx ^ v0.yx; + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v15 = ((v15 ^ v0).xy >> ROTATE_16) | ((v15 ^ v0).yx << ROTATE_16); + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy << ROTATE_1) | ((v5 ^ v10).yx >> ROTATE_31); - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); + /** + * r=10, i=5, a=v[1], b=v[6], c=v[11], d=v[12] + */ + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v12 = v12.yx ^ v1.yx; + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy >> ROTATE_24) | ((v6 ^ v11).yx << ROTATE_8); + v1 = v1 + v6 + vec2(0u, u32(v1.x + v6.x < v1.x)); + v12 = ((v12 ^ v1).xy >> ROTATE_16) | ((v12 ^ v1).yx << ROTATE_16); + v11 = v11 + v12 + vec2(0u, u32(v11.x + v12.x < v11.x)); + v6 = ((v6 ^ v11).xy << ROTATE_1) | ((v6 ^ v11).yx >> ROTATE_31); - v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31); - v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31); + /** + * r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13] + */ + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v13 = v13.yx ^ v2.yx; + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy << ROTATE_1) | ((v7 ^ v8).yx >> ROTATE_31); - v45 = vec4(v74.zw, v56.xy); - v67 = vec4(v56.zw, v74.xy); - vCD = vec4(vFC.zw, vDE.xy); - vEF = vec4(vDE.zw, vFC.xy); + /** + * r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + */ + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v14 = v14.yx ^ v3.yx; + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy >> ROTATE_24) | ((v4 ^ v9).yx << ROTATE_8); + v3 = v3 + v4 + vec2(0u, u32(v3.x + v4.x < v3.x)); + v14 = ((v14 ^ v3).xy >> ROTATE_16) | ((v14 ^ v3).yx << ROTATE_16); + v9 = v9 + v14 + vec2(0u, u32(v9.x + v14.x < v9.x)); + v4 = ((v4 ^ v9).xy << ROTATE_1) | ((v4 ^ v9).yx >> ROTATE_31); @@ -1150,86 +1361,88 @@ fn main(id: vec3) { ****************************************************************************/ /** - * r=11, i=0, a=v[0], b=v[4], c=v[8], d=v[12], m[sigma]=14, m[sigma+1]=10 - * r=11, i=1, a=v[1], b=v[5], c=v[9], d=v[13], m[sigma]=4, m[sigma+1]=8 - * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15 - * r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6 + * r=11, i=0, a=v[0], b=v[4], c=v[8], d=v[12] */ - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - v01 += vec4(Z, m4) + vec4(Z, 0u, u32(v01.z + m4.x < v01.z)); - // NOP - - vCD = (vCD ^ v01).yxwz; - vEF = (vEF ^ v23).yxwz; - - v89 += vCD + vec4(0u, 
u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) >> ROTATE_24) | ((v45 ^ v89).yxwz << ROTATE_8); - v67 = ((v67 ^ vAB) >> ROTATE_24) | ((v67 ^ vAB).yxwz << ROTATE_8); - - v01 += v45 + vec4(0u, u32(v01.x + v45.x < v01.x), 0u, u32(v01.z + v45.z < v01.z)); - v23 += v67 + vec4(0u, u32(v23.x + v67.x < v23.x), 0u, u32(v23.z + v67.z < v23.z)); - - // NOP - // NOP - - vCD = ((vCD ^ v01) >> ROTATE_16) | ((vCD ^ v01).yxwz << ROTATE_16); - vEF = ((vEF ^ v23) >> ROTATE_16) | ((vEF ^ v23).yxwz << ROTATE_16); - - v89 += vCD + vec4(0u, u32(v89.x + vCD.x < v89.x), 0u, u32(v89.z + vCD.z < v89.z)); - vAB += vEF + vec4(0u, u32(vAB.x + vEF.x < vAB.x), 0u, u32(vAB.z + vEF.z < vAB.z)); - - v45 = ((v45 ^ v89) << ROTATE_1) | ((v45 ^ v89).yxwz >> ROTATE_31); - v67 = ((v67 ^ vAB) << ROTATE_1) | ((v67 ^ vAB).yxwz >> ROTATE_31); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v12 = v12.yx ^ v0.yx; + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + v4 = ((v4 ^ v8).xy >> ROTATE_24) | ((v4 ^ v8).yx << ROTATE_8); + v0 = v0 + v4 + vec2(0u, u32(v0.x + v4.x < v0.x)); + v12 = ((v12 ^ v0).xy >> ROTATE_16) | ((v12 ^ v0).yx << ROTATE_16); + v8 = v8 + v12 + vec2(0u, u32(v8.x + v12.x < v8.x)); + // skip since it does not affect the final values of `v0` and `v8` /** - * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12 - * r=11, i=5, a=v[1], b=v[6], c=v[11], d=v[12], m[sigma]=0, m[sigma+1]=2 - * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7 - * r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3 + * r=11, i=1, a=v[1], b=v[5], c=v[9], d=v[13] */ - v56 = vec4(v45.zw, v67.xy); - vFC = vec4(vEF.zw, vCD.xy); - v74 = vec4(v67.zw, v45.xy); - vDE = vec4(vCD.zw, vEF.xy); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); - - v01 += vec4(m1, m0) + vec4(0u, u32(v01.x + m1.x < v01.x), 0u, u32(v01.z + m0.x < v01.z)); - // NOP - - vFC = (vFC ^ v01).yxwz; - vDE = (vDE ^ v23).yxwz; + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v1 = v1 + m4 + vec2(0u, u32(v1.x + m4.x < v1.x)); + v13 = v13.yx ^ v1.yx; + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy >> ROTATE_24) | ((v5 ^ v9).yx << ROTATE_8); + v1 = v1 + v5 + vec2(0u, u32(v1.x + v5.x < v1.x)); + v13 = ((v13 ^ v1).xy >> ROTATE_16) | ((v13 ^ v1).yx << ROTATE_16); + v9 = v9 + v13 + vec2(0u, u32(v9.x + v13.x < v9.x)); + v5 = ((v5 ^ v9).xy << ROTATE_1) | ((v5 ^ v9).yx >> ROTATE_31); - vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); - - v56 = ((v56 ^ vAB) >> ROTATE_24) | ((v56 ^ vAB).yxwz << ROTATE_8); - v74 = ((v74 ^ v89) >> ROTATE_24) | ((v74 ^ v89).yxwz << ROTATE_8); - - v01 += v56 + vec4(0u, u32(v01.x + v56.x < v01.x), 0u, u32(v01.z + v56.z < v01.z)); - v23 += v74 + vec4(0u, u32(v23.x + v74.x < v23.x), 0u, u32(v23.z + v74.z < v23.z)); + /** + * r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14] + */ + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v14 = v14.yx ^ v2.yx; + v10 = v10 + v14 + vec2(0u, u32(v10.x + v14.x < v10.x)); + v6 = ((v6 ^ v10).xy >> ROTATE_24) | ((v6 ^ v10).yx << ROTATE_8); + v2 = v2 + v6 + vec2(0u, u32(v2.x + v6.x < v2.x)); + v14 = ((v14 ^ v2).xy >> ROTATE_16) | ((v14 ^ v2).yx << ROTATE_16); + v10 = v10 + v14 + vec2(0u, 
u32(v10.x + v14.x < v10.x)); + // skip since it does not affect the final values of `v0` and `v8` - v01 += vec4(Z, m2) + vec4(Z, 0u, u32(v01.z + m2.x < v01.z)); - v23 += vec4(Z, m3) + vec4(Z, 0u, u32(v23.z + m3.x < v23.z)); + /** + * r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15] + */ + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v15 = v15.yx ^ v3.yx; + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy >> ROTATE_24) | ((v7 ^ v11).yx << ROTATE_8); + v3 = v3 + v7 + vec2(0u, u32(v3.x + v7.x < v3.x)); + v15 = ((v15 ^ v3).xy >> ROTATE_16) | ((v15 ^ v3).yx << ROTATE_16); + v11 = v11 + v15 + vec2(0u, u32(v11.x + v15.x < v11.x)); + v7 = ((v7 ^ v11).xy << ROTATE_1) | ((v7 ^ v11).yx >> ROTATE_31); - // vFC = ((vFC ^ v01) >> ROTATE_16) | ((vFC ^ v01).yxwz << ROTATE_16); - vDE = ((vDE ^ v23) >> ROTATE_16) | ((vDE ^ v23).yxwz << ROTATE_16); + /** + * r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15] + */ + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + v0 = v0 + m1 + vec2(0u, u32(v0.x + m1.x < v0.x)); + v15 = v15.yx ^ v0.yx; + v10 = v10 + v15 + vec2(0u, u32(v10.x + v15.x < v10.x)); + v5 = ((v5 ^ v10).xy >> ROTATE_24) | ((v5 ^ v10).yx << ROTATE_8); + v0 = v0 + v5 + vec2(0u, u32(v0.x + v5.x < v0.x)); + // skip since it does not affect the final values of `v0` and `v8` + // skip since it does not affect the final values of `v0` and `v8` + // skip since it does not affect the final values of `v0` and `v8` - // vAB += vFC + vec4(0u, u32(vAB.x + vFC.x < vAB.x), 0u, u32(vAB.z + vFC.z < vAB.z)); - v89 += vDE + vec4(0u, u32(v89.x + vDE.x < v89.x), 0u, u32(v89.z + vDE.z < v89.z)); + /** + * r=11, i=5, a=v[1], b=v[6], c=v[11], d=v[12] + */ + // skip entire step since it does not affect the final values of `v0` and `v8` - // v56 = ((v56 ^ vAB) << ROTATE_1) | ((v56 ^ vAB).yxwz >> ROTATE_31); - // v74 = ((v74 ^ v89) << ROTATE_1) | ((v74 ^ v89).yxwz >> ROTATE_31); + /** + * r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13] + */ + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v13 = v13.yx ^ v2.yx; + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + v7 = ((v7 ^ v8).xy >> ROTATE_24) | ((v7 ^ v8).yx << ROTATE_8); + v2 = v2 + v7 + vec2(0u, u32(v2.x + v7.x < v2.x)); + v13 = ((v13 ^ v2).xy >> ROTATE_16) | ((v13 ^ v2).yx << ROTATE_16); + v8 = v8 + v13 + vec2(0u, u32(v8.x + v13.x < v8.x)); + // skip since we already have the final values of `v0` and `v8` - // v45 = vec4(v74.zw, v56.xy); - // v67 = vec4(v56.zw, v74.xy); - // vCD = vec4(vFC.zw, vDE.xy); - // vEF = vec4(vDE.zw, vFC.xy); + /** + * r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14] + */ + // skip entire step since we already have the final values of `v0` and `v8` @@ -1241,11 +1454,9 @@ fn main(id: vec3) { /** * Set nonce if it passes the threshold and no other thread has set it. - * Numeric literal used in the finalization digest is the original value of the - * first element of the initialization vector `blake2b_IV[0]` which in NanoPow - * is initialized at vector component `v01.y`. + * Only high bits are needed for comparison since threshold low bits are zero. */ - if ((0x6A09E667u ^ v01.y ^ v89.y) > ubo.threshold) { + if ((BLAKE2B_IV_0.y ^ v0.y ^ v8.y) > ubo.threshold) { let wasFound: u32 = atomicExchange(&work.found, 1u); if (wasFound == 0u) { work.nonce = m0;
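+
+ /**
+ * Illustration only: `u64_add` and `u64_rotr24` below are assumed helper
+ * names for this sketch, not functions in the shader. Every 64-bit operand
+ * above is a vec2<u32>(lo, hi), so addition must carry the low-word
+ * wrap-around into the high word, and rotations pull bits in from the
+ * opposite word:
+ *
+ *     fn u64_add (a: vec2<u32>, b: vec2<u32>) -> vec2<u32> {
+ *         let lo: u32 = a.x + b.x;
+ *         // carry is 1u when the low-word sum wrapped past 2^32
+ *         return vec2(lo, a.y + b.y + u32(lo < a.x));
+ *     }
+ *
+ *     fn u64_rotr24 (v: vec2<u32>) -> vec2<u32> {
+ *         // shift each word right by 24 and pull the low 24 bits of the
+ *         // opposite word in as the new high bits
+ *         return (v >> ROTATE_24) | (v.yx << ROTATE_8);
+ *     }
+ *
+ * The comparison above XORs only the high words: the full digest word
+ * would be vec2(BLAKE2B_IV_0.x ^ v0.x ^ v8.x, BLAKE2B_IV_0.y ^ v0.y ^
+ * v8.y), and a threshold whose low 32 bits are zero is decided by the
+ * `.y` component alone.
+ */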