BLAKE2B_IV[7u]
);
+/**
+* BLAKE2b G mixing function. 64-bit words are emulated as vec2<u32> with
+* x = low 32 bits and y = high 32 bits, since WGSL has no native u64 type.
+* `a`..`d` are the four state words mixed in place; `m0`/`m1` are the two
+* message words m[sigma[r][2*i]] and m[sigma[r][2*i+1]] for this subprocess.
+*/
+fn G (
+ a: ptr<function, vec2<u32>>,
+ b: ptr<function, vec2<u32>>,
+ c: ptr<function, vec2<u32>>,
+ d: ptr<function, vec2<u32>>,
+ m0: vec2<u32>, m1: vec2<u32>
+) {
+ // a = a + b + m0: each 64-bit add needs manual carry propagation — if the
+ // low word wrapped around, it is now smaller than the addend's low word.
+ *a += *b;
+ (*a).y += u32((*a).x < (*b).x);
+ *a += m0;
+ (*a).y += u32((*a).x < m0.x);
+
+ // d = rotr64(d ^ a, 32): rotating by exactly 32 is a word swap (.yx).
+ *d = (*d ^ *a).yx;
+
+ // c = c + d with carry.
+ *c += *d;
+ (*c).y += u32((*c).x < (*d).x);
+
+ // b = rotr64(b ^ c, 24): each word keeps its own bits shifted right 24,
+ // OR'd with the opposite word's bits shifted left 8 (crossed via .yx).
+ *b ^= *c;
+ *b = (*b >> vec2(24u)) | (*b << vec2(8u)).yx;
+
+ // a = a + b + m1 with carry.
+ *a += *b;
+ (*a).y += u32((*a).x < (*b).x);
+ *a += m1;
+ (*a).y += u32((*a).x < m1.x);
+
+ // d = rotr64(d ^ a, 16).
+ *d ^= *a;
+ *d = (*d >> vec2(16u)) | (*d << vec2(16u)).yx;
+
+ // c = c + d with carry.
+ *c += *d;
+ (*c).y += u32((*c).x < (*d).x);
+
+ // b = rotr64(b ^ c, 63), equivalent to rotl64 by 1.
+ *b ^= *c;
+ *b = (*b >> vec2(31u)).yx | (*b << vec2(1u));
+}
+
/**
* Used to fill partial `m` vec4 constructions.
*/
const Z = vec2(0u);
-/**
-* Used to apply boolean mask to swizzled result of carry bit comparison.
-*/
-const CARRY = vec4(1u, 0u, 1u, 0u);
-
-/**
-* Used to rotate bits by a fixed amount during G mixing.
-*/
-const ROTATE_1 = vec4(1u);
-const ROTATE_8 = vec4(8u);
-const ROTATE_16 = vec4(16u);
-const ROTATE_24 = vec4(24u);
-const ROTATE_31 = vec4(31u);
-
/**
* Shared flag to prevent execution for all workgroup threads based on the
* atomicLoad() result of a single member thread.
*/
var<workgroup> found: bool;
+/**
+* Shared memory for seed and blockhash which do not change during execution and
+* are eventually concatenated to form the hash input message.
+*/
+var<workgroup> seed: vec2<u32>;
+var<workgroup> m1: vec2<u32>;
+var<workgroup> m2: vec2<u32>;
+var<workgroup> m3: vec2<u32>;
+var<workgroup> m4: vec2<u32>;
+
/**
* Search compute function
-* Calls main with a workgroup size of 64 which is generally considered optimal
-* due to how warps and wavefronts are executed on modern GPUs. The entire
-* workgroup exits immediately if a nonce was already found by a previous
-* workgroup.
+* Calls main with a workgroup size of 96 which in testing was the lowest value
+* that would saturate the GPU active thread count and warp occupancy which
+* provides a decent balance with the power-sensitive requirements of mobile
+* devices. The entire workgroup exits immediately if a nonce was already found
+* by a previous workgroup.
*/
-@compute @workgroup_size(32)
+@compute @workgroup_size(96)
fn search(@builtin(global_invocation_id) global_id: vec3<u32>, @builtin(local_invocation_id) local_id: vec3<u32>) {
- found = (local_id.x == 0u && atomicLoad(&work.found) != 0u);
+ if (local_id.x == 0u) {
+ found = atomicLoad(&work.found) != 0u;
+ seed = ubo.seed;
+ m1 = ubo.blockhash[0u].xy;
+ m2 = ubo.blockhash[0u].zw;
+ m3 = ubo.blockhash[1u].xy;
+ m4 = ubo.blockhash[1u].zw;
+ }
workgroupBarrier();
if (found) { return; }
main(global_id, false);
/**
* Validate compute function
-* Calls main with a workgroup size of 1 so that only one value is tested
+* Calls main with a workgroup size of 1 so that only one value is tested.
*/
@compute @workgroup_size(1)
fn validate(@builtin(global_invocation_id) global_id: vec3<u32>) {
+ // With a workgroup size of 1 the single invocation can populate the shared
+ // message words directly — no barrier is needed since no other invocation
+ // in the workgroup will read them.
+ seed = ubo.seed;
+ m1 = ubo.blockhash[0u].xy;
+ m2 = ubo.blockhash[0u].zw;
+ m3 = ubo.blockhash[1u].xy;
+ m4 = ubo.blockhash[1u].zw;
 main(global_id, true);
}
/**
* Main compute function
-* A random u32 provided by the UBO is copied to form a pair. Each component of
-* this 8-byte value is then XOR'd with a different dimensional index from
-* the thread identifier.
+* Each component of a random 8-byte value, provided by the UBO as a vec2<u32>,
+* is XOR'd with a different dimensional index from the global thread identifier
+* to create a unique nonce value for each thread.
*
* Where the reference implementation uses array lookups, the NanoPow
* implementation assigns each array element to its own variable to enhance
*/
fn main(id: vec3<u32>, validate: bool) {
/**
- * Initialize (nonce||blockhash) concatenation
+ * Initialize unique nonce
*/
- let m0: vec2<u32> = ubo.seed ^ id.xy;
- let m1: vec2<u32> = ubo.blockhash[0u].xy;
- let m2: vec2<u32> = ubo.blockhash[0u].zw;
- let m3: vec2<u32> = ubo.blockhash[1u].xy;
- let m4: vec2<u32> = ubo.blockhash[1u].zw;
+ let m0: vec2<u32> = seed ^ id.xy;
/**
* Compression buffer copied from the modified initialization vector.
*/
- var v01: vec4<u32> = vec4<u32>(BLAKE2B_INIT[0u], BLAKE2B_INIT[1u]);
- var v23: vec4<u32> = vec4<u32>(BLAKE2B_INIT[2u], BLAKE2B_INIT[3u]);
- var v45: vec4<u32> = vec4<u32>(BLAKE2B_INIT[4u], BLAKE2B_INIT[5u]);
- var v67: vec4<u32> = vec4<u32>(BLAKE2B_INIT[6u], BLAKE2B_INIT[7u]);
- var v89: vec4<u32> = vec4<u32>(BLAKE2B_INIT[8u], BLAKE2B_INIT[9u]);
- var vAB: vec4<u32> = vec4<u32>(BLAKE2B_INIT[10u], BLAKE2B_INIT[11u]);
- var vCD: vec4<u32> = vec4<u32>(BLAKE2B_INIT[12u], BLAKE2B_INIT[13u]);
- var vEF: vec4<u32> = vec4<u32>(BLAKE2B_INIT[14u], BLAKE2B_INIT[15u]);
-
- /**
- * Temporary variables used for subprocesses i=4 through i=7
- */
- var v56: vec4<u32>;
- var vFC: vec4<u32>;
- var v74: vec4<u32>;
- var vDE: vec4<u32>;
+ var v0: vec2<u32> = vec2<u32>(BLAKE2B_INIT[0u]);
+ var v1: vec2<u32> = vec2<u32>(BLAKE2B_INIT[1u]);
+ var v2: vec2<u32> = vec2<u32>(BLAKE2B_INIT[2u]);
+ var v3: vec2<u32> = vec2<u32>(BLAKE2B_INIT[3u]);
+ var v4: vec2<u32> = vec2<u32>(BLAKE2B_INIT[4u]);
+ var v5: vec2<u32> = vec2<u32>(BLAKE2B_INIT[5u]);
+ var v6: vec2<u32> = vec2<u32>(BLAKE2B_INIT[6u]);
+ var v7: vec2<u32> = vec2<u32>(BLAKE2B_INIT[7u]);
+ var v8: vec2<u32> = vec2<u32>(BLAKE2B_INIT[8u]);
+ var v9: vec2<u32> = vec2<u32>(BLAKE2B_INIT[9u]);
+ var vA: vec2<u32> = vec2<u32>(BLAKE2B_INIT[10u]);
+ var vB: vec2<u32> = vec2<u32>(BLAKE2B_INIT[11u]);
+ var vC: vec2<u32> = vec2<u32>(BLAKE2B_INIT[12u]);
+ var vD: vec2<u32> = vec2<u32>(BLAKE2B_INIT[13u]);
+ var vE: vec2<u32> = vec2<u32>(BLAKE2B_INIT[14u]);
+ var vF: vec2<u32> = vec2<u32>(BLAKE2B_INIT[15u]);
/**
* Twelve rounds of G mixing as part of BLAKE2b compression step, each divided
- * into eight subprocesses. Each statement of the first four subprocesses is
- * executed in sequence so that the compiler can interleave independent
- * instructions for improved scheduling. That is to say, the first statement
- * `a = a + b` is executed for subprocesses 1-4, and then the next statement
- * `a = a + m[sigma[r][2*i+0]]` is executed, and so on through all the steps of
- * the mixing function. Once subprocesses 1-4 are done, computation on
- * subprocesses 5-8 are executed in the same manner.
+ * into eight subprocesses, each of which is further paired to be processed in
+ * parallel by packing independent vec2 variables into vec4 variables.
+ * Each subprocess statement execution is alternated so that the compiler can
+ * interleave independent instructions for improved scheduling. That is to say,
+ * the first statement `a = a + b` is executed for each subprocess, and then
+ * the next statement `a = a + m[sigma[r][2*i+0]]` is executed, and so on
+ * through all the steps of the G mix function. Once subprocesses 1-4 are done,
+ * computation on subprocesses 5-8 are executed in the same manner.
*
* Each subprocess applies transformations to `m` and `v` variables based on a
* defined set of index inputs. The algorithm for each subprocess is defined as
* b = rotr64(b ^ c, 63)
*
* Each sum step has an extra carry addition. Note that the m[sigma] sum is
- * skipped if m[sigma] is zero since it effectively does nothing.
+ * skipped if m[sigma] is zero since it effectively does nothing. Also note
+ * that rotations must be applied differently from the reference implementation
+ * due to the lack of both a native rotate function and 64-bit support in WGSL.
*/
- var s0: vec4<u32>;
- var s1: vec4<u32>;
/****************************************************************************
* ROUND(0) *
* r=0, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5
* r=0, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(m0, m2);
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- v23 += vec4(m4, Z);
- v23.y += u32(v23.x < m4.x);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(m1, m3);
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- // NOP
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, m0, m1);
+ G(&v1, &v5, &v9, &vD, m2, m3);
+ G(&v2, &v6, &vA, &vE, m4, Z);
+ G(&v3, &v7, &vB, &vF, Z, Z);
/**
* r=0, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9
* r=0, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13
* r=0, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
+ G(&v0, &v5, &vA, &vF, Z, Z);
+ G(&v1, &v6, &vB, &vC, Z, Z);
+ G(&v2, &v7, &v8, &vD, Z, Z);
+ G(&v3, &v4, &v9, &vE, Z, Z);
/****************************************************************************
* ROUND(1) *
* r=1, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15
* r=1, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(Z, m4);
- v01.w += u32(v01.z < m4.x);
- // NOP
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, Z);
+ G(&v1, &v5, &v9, &vD, m4, Z);
+ G(&v2, &v6, &vA, &vE, Z, Z);
+ G(&v3, &v7, &vB, &vF, Z, Z);
/**
* r=1, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12
* r=1, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7
* r=1, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(m1, m0);
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(Z, m2);
- v01.w += u32(v01.z < m2.x);
- v23 += vec4(Z, m3);
- v23.w += u32(v23.z < m3.x);
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, m1, Z);
+ G(&v1, &v6, &vB, &vC, m0, m2);
+ G(&v2, &v7, &v8, &vD, Z, Z);
+ G(&v3, &v4, &v9, &vE, Z, m3);
/****************************************************************************
* ROUND(2) *
* r=2, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=5, m[sigma+1]=2
* r=2, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=15, m[sigma+1]=13
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(Z, m0);
- v01= s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + vec4(m2, Z);
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, Z);
+ G(&v1, &v5, &v9, &vD, Z, m0);
+ G(&v2, &v6, &vA, &vE, Z, m2);
+ G(&v3, &v7, &vB, &vF, Z, Z);
/**
* r=2, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=10, m[sigma+1]=14
* r=2, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=7, m[sigma+1]=1
* r=2, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=9, m[sigma+1]=4
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z));
- v01.z += m3.x;
- v01.w += m3.y + u32(v01.z < m3.x);
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(m1, m4) + vec4(0u, u32(v23.x + vec4(m1, m4).x < v23.x), 0u, u32(v23.z + vec4(m1, m4).z < v23.z));
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, Z, Z);
+ G(&v1, &v6, &vB, &vC, m3, Z);
+ G(&v2, &v7, &v8, &vD, Z, m1);
+ G(&v3, &v4, &v9, &vE, Z, m4);
/****************************************************************************
* ROUND(3) *
* r=3, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=13, m[sigma+1]=12
* r=3, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=11, m[sigma+1]=14
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z));
- // NOP
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + vec4(Z, m1).z < v01.z));
- // NOP
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, Z);
+ G(&v1, &v5, &v9, &vD, m3, m1);
+ G(&v2, &v6, &vA, &vE, Z, Z);
+ G(&v3, &v7, &vB, &vF, Z, Z);
/**
* r=3, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=2, m[sigma+1]=6
* r=3, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=4, m[sigma+1]=0
* r=3, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=15, m[sigma+1]=8
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z);
- v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z);
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + vec4(m0, Z).x < v23.x), Z);
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, m2, Z);
+ G(&v1, &v6, &vB, &vC, Z, Z);
+ G(&v2, &v7, &v8, &vD, m4, m0);
+ G(&v3, &v4, &v9, &vE, Z, Z);
/****************************************************************************
* ROUND(4) *
* r=4, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=2, m[sigma+1]=4
* r=4, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=10, m[sigma+1]=15
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- s1 = v23 + vec4(m2, Z);
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + vec4(m0, Z).x < v01.x), Z);
- v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z);
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, m0);
+ G(&v1, &v5, &v9, &vD, Z, Z);
+ G(&v2, &v6, &vA, &vE, m2, m4);
+ G(&v3, &v7, &vB, &vF, Z, Z);
/**
* r=4, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=14, m[sigma+1]=1
* r=4, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=6, m[sigma+1]=8
* r=4, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=3, m[sigma+1]=13
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- s1 = v23 + vec4(Z, m3);
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m1, Z) + vec4(0u, u32(v01.x + vec4(m1, Z).x < v01.x), Z);
- // NOP
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, Z, m1);
+ G(&v1, &v6, &vB, &vC, Z, Z);
+ G(&v2, &v7, &v8, &vD, Z, Z);
+ G(&v3, &v4, &v9, &vE, m3, Z);
/****************************************************************************
* ROUND(5) *
* r=5, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=0, m[sigma+1]=11
* r=5, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=8, m[sigma+1]=3
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z);
- v23 += vec4(m0, Z) + vec4(0u, u32(v23.x + vec4(m0, Z).x < v23.x), Z);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- s1 = v23 + vec4(Z, m3);
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, m2, Z);
+ G(&v1, &v5, &v9, &vD, Z, Z);
+ G(&v2, &v6, &vA, &vE, m0, Z);
+ G(&v3, &v7, &vB, &vF, Z, m3);
/**
* r=5, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=4, m[sigma+1]=13
* r=5, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=15, m[sigma+1]=14
* r=5, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=1, m[sigma+1]=9
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m4, Z) + vec4(0u, u32(v01.x + vec4(m4, Z).x < v01.x), Z);
- v23 += vec4(Z, m1) + vec4(Z, 0u, u32(v23.z + vec4(Z, m1).z < v23.z));
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, m4, Z);
+ G(&v1, &v6, &vB, &vC, Z, Z);
+ G(&v2, &v7, &v8, &vD, Z, Z);
+ G(&v3, &v4, &v9, &vE, m1, Z);
/****************************************************************************
* ROUND(6) *
* r=6, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=14, m[sigma+1]=13
* r=6, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=4, m[sigma+1]=10
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(Z, m1) + vec4(Z, 0u, u32(v01.z + vec4(Z, m1).z < v01.z));
- v23 += vec4(Z, m4) + vec4(Z, 0u, u32(v23.z + vec4(Z, m4).z < v23.z));
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, Z);
+ G(&v1, &v5, &v9, &vD, m1, Z);
+ G(&v2, &v6, &vA, &vE, Z, Z);
+ G(&v3, &v7, &vB, &vF, m4, Z);
/**
* r=6, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=0, m[sigma+1]=7
* r=6, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=9, m[sigma+1]=2
* r=6, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=8, m[sigma+1]=11
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m0, Z) + vec4(0u, u32(v01.x + vec4(m0, Z).x < v01.x), Z);
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(Z, m3) + vec4(Z, 0u, u32(v01.z + vec4(Z, m3).z < v01.z));
- s1 = v23 + vec4(m2, Z);
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, m0, Z);
+ G(&v1, &v6, &vB, &vC, Z, m3);
+ G(&v2, &v7, &v8, &vD, Z, m2);
+ G(&v3, &v4, &v9, &vE, Z, Z);
/****************************************************************************
* ROUND(7) *
* r=7, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=12, m[sigma+1]=1
* r=7, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=3, m[sigma+1]=9
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- s1 = v23 + vec4(Z, m3);
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + vec4(m1, Z).x < v23.x), Z);
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, Z);
+ G(&v1, &v5, &v9, &vD, Z, Z);
+ G(&v2, &v6, &vA, &vE, Z, m1);
+ G(&v3, &v7, &vB, &vF, m3, Z);
/**
* r=7, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=5, m[sigma+1]=0
* r=7, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=8, m[sigma+1]=6
* r=7, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=2, m[sigma+1]=10
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(Z, m2) + vec4(Z, 0u, u32(v23.z + vec4(Z, m2).z < v23.z));
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m0, m4) + vec4(0u, u32(v01.x + vec4(m0, m4).x < v01.x), 0u, u32(v01.z + vec4(m0, m4).z < v01.z));
- // NOP
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, Z, m0);
+ G(&v1, &v6, &vB, &vC, Z, m4);
+ G(&v2, &v7, &v8, &vD, Z, Z);
+ G(&v3, &v4, &v9, &vE, m2, Z);
/****************************************************************************
* ROUND(8) *
* r=8, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=11, m[sigma+1]=3
* r=8, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=0, m[sigma+1]=8
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(Z, m0);
- v23.w += u32(v23.z < m0.x);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(m3, Z) + vec4(0u, u32(v23.x + vec4(m3, Z).x < v23.x), Z);
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, Z);
+ G(&v1, &v5, &v9, &vD, Z, Z);
+ G(&v2, &v6, &vA, &vE, Z, m3);
+ G(&v3, &v7, &vB, &vF, m0, Z);
/**
* r=8, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=12, m[sigma+1]=2
* r=8, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=1, m[sigma+1]=4
* r=8, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=10, m[sigma+1]=5
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(m1, Z) + vec4(0u, u32(v23.x + vec4(m1, Z).x < v23.x), Z);
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(m2, Z) + vec4(0u, u32(v01.x + vec4(m2, Z).x < v01.x), Z);
- v23 += vec4(m4, Z) + vec4(0u, u32(v23.x + vec4(m4, Z).x < v23.x), Z);
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, Z, m2);
+ G(&v1, &v6, &vB, &vC, Z, Z);
+ G(&v2, &v7, &v8, &vD, m1, m4);
+ G(&v3, &v4, &v9, &vE, Z, Z);
/****************************************************************************
* ROUND(9) *
* r=9, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=7, m[sigma+1]=6
* r=9, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=1, m[sigma+1]=5
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(Z, m1);
- v23.w += u32(v23.z < m1.x);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(m2, m4);
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- // NOP
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, m2);
+ G(&v1, &v5, &v9, &vD, Z, m4);
+ G(&v2, &v6, &vA, &vE, Z, Z);
+ G(&v3, &v7, &vB, &vF, m1, Z);
/**
* r=9, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=15, m[sigma+1]=11
* r=9, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=3, m[sigma+1]=12
* r=9, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=13, m[sigma+1]=0
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(m3, Z);
- v23.y += u32(v23.x < m3.x);
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- v23 += vec4(Z, m0);
- v23.w += u32(v23.z < m0.x);
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, Z, Z);
+ G(&v1, &v6, &vB, &vC, Z, Z);
+ G(&v2, &v7, &v8, &vD, m3, Z);
+ G(&v3, &v4, &v9, &vE, Z, m0);
/****************************************************************************
* ROUND(10) *
* r=10, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=4, m[sigma+1]=5
* r=10, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=6, m[sigma+1]=7
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(m0, m2);
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- v23 += vec4(m4, Z);
- v23.y += u32(v23.x < m4.x);
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(m1, m3);
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- // NOP
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, m0, m1);
+ G(&v1, &v5, &v9, &vD, m2, m3);
+ G(&v2, &v6, &vA, &vE, m4, Z);
+ G(&v3, &v7, &vB, &vF, Z, Z);
/**
* r=10, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=8, m[sigma+1]=9
* r=10, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=12, m[sigma+1]=13
* r=10, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=14, m[sigma+1]=15
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vFC ^= v01;
- vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_31).yxwz | (v56 << ROTATE_1);
- v74 ^= v89;
- v74 = (v74 >> ROTATE_31).yxwz | (v74 << ROTATE_1);
-
- v45 = vec4(v74.zw, v56.xy);
- v67 = vec4(v56.zw, v74.xy);
- vCD = vec4(vFC.zw, vDE.xy);
- vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, Z, Z);
+ G(&v1, &v6, &vB, &vC, Z, Z);
+ G(&v2, &v7, &v8, &vD, Z, Z);
+ G(&v3, &v4, &v9, &vE, Z, Z);
/****************************************************************************
* ROUND(11) *
* r=11, i=2, a=v[2], b=v[6], c=v[10], d=v[14], m[sigma]=9, m[sigma+1]=15
* r=11, i=3, a=v[3], b=v[7], c=v[11], d=v[15], m[sigma]=13, m[sigma+1]=6
*/
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- v01 += vec4(Z, m4);
- v01.w += u32(v01.z < m4.x);
- // NOP
-
- vCD = (vCD ^ v01).yxwz;
- vEF = (vEF ^ v23).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_24) | (v45 << ROTATE_8).yxwz;
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_24) | (v67 << ROTATE_8).yxwz;
-
- s0 = v01 + v45;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v67;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // NOP
- // NOP
-
- vCD ^= v01;
- vCD = (vCD >> ROTATE_16) | (vCD << ROTATE_16).yxwz;
- vEF ^= v23;
- vEF = (vEF >> ROTATE_16) | (vEF << ROTATE_16).yxwz;
-
- s0 = v89 + vCD;
- v89 = s0 + (vec4<u32>(s0 < v89) & CARRY).yxwz;
- s1 = vAB + vEF;
- vAB = s1 + (vec4<u32>(s1 < vAB) & CARRY).yxwz;
-
- v45 ^= v89;
- v45 = (v45 >> ROTATE_31).yxwz | (v45 << ROTATE_1);
- v67 ^= vAB;
- v67 = (v67 >> ROTATE_31).yxwz | (v67 << ROTATE_1);
+ G(&v0, &v4, &v8, &vC, Z, Z);
+ G(&v1, &v5, &v9, &vD, m4, Z);
+ G(&v2, &v6, &vA, &vE, Z, Z);
+ G(&v3, &v7, &vB, &vF, Z, Z);
/**
* r=11, i=4, a=v[0], b=v[5], c=v[10], d=v[15], m[sigma]=1, m[sigma+1]=12
* r=11, i=6, a=v[2], b=v[7], c=v[8], d=v[13], m[sigma]=11, m[sigma+1]=7
* r=11, i=7, a=v[3], b=v[4], c=v[9], d=v[14], m[sigma]=5, m[sigma+1]=3
*/
- v56 = vec4(v45.zw, v67.xy);
- v74 = vec4(v67.zw, v45.xy);
- vFC = vec4(vEF.zw, vCD.xy);
- vDE = vec4(vCD.zw, vEF.xy);
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- s0 = v01 + vec4(m1, m0);
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- // NOP
-
- vFC = (vFC ^ v01).yxwz;
- vDE = (vDE ^ v23).yxwz;
-
- s0 = vAB + vFC;
- vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- v56 ^= vAB;
- v56 = (v56 >> ROTATE_24) | (v56 << ROTATE_8).yxwz;
- v74 ^= v89;
- v74 = (v74 >> ROTATE_24) | (v74 << ROTATE_8).yxwz;
-
- s0 = v01 + v56;
- v01 = s0 + (vec4<u32>(s0 < v01) & CARRY).yxwz;
- s1 = v23 + v74;
- v23 = s1 + (vec4<u32>(s1 < v23) & CARRY).yxwz;
-
- // v01 += vec4(Z, m2);
- // v01.w += u32(v01.z < m2.x);
- v23 += vec4(Z, m3);
- v23.w += u32(v23.z < m3.x);
-
- // vFC ^= v01;
- // vFC = (vFC >> ROTATE_16) | (vFC << ROTATE_16).yxwz;
- vDE ^= v23;
- vDE = (vDE >> ROTATE_16) | (vDE << ROTATE_16).yxwz;
-
- // s0 = vAB + vFC;
- // vAB = s0 + (vec4<u32>(s0 < vAB) & CARRY).yxwz;
- s1 = v89 + vDE;
- v89 = s1 + (vec4<u32>(s1 < v89) & CARRY).yxwz;
-
- // v56 ^= vAB;
- // v74 ^= v89;
- // v56 = (v56 << ROTATE_1) | (v56 >> ROTATE_31).yxwz;
- // v74 = (v74 << ROTATE_1) | (v74 >> ROTATE_31).yxwz;
-
- // v45 = vec4(v74.zw, v56.xy);
- // v67 = vec4(v56.zw, v74.xy);
- // vCD = vec4(vFC.zw, vDE.xy);
- // vEF = vec4(vDE.zw, vFC.xy);
-
-
-
-
+ G(&v0, &v5, &vA, &vF, m1, Z);
+ // G(&v1, &v6, &vB, &vC, m0, m2);
+ G(&v2, &v7, &v8, &vD, Z, Z);
+ // G(&v3, &v4, &v9, &vE, Z, m3);
/****************************************************************************
* NONCE CHECK                                                              *
****************************************************************************/
/**
* Set nonce if it passes the difficulty threshold and no other thread has set it.
* Note: WGSL block comments nest, so the banner above must be closed before
* this doc comment opens, otherwise everything below is commented out.
*/
- var result = BLAKE2B_INIT[0u] ^ v01.xy ^ v89.xy;
+ var result: vec2<u32> = BLAKE2B_INIT[0u] ^ v0 ^ v8;
	if (validate || ((result.y > ubo.difficulty.y || (result.y == ubo.difficulty.y && result.x >= ubo.difficulty.x)) && atomicLoad(&work.found) == 0u)) {
atomicStore(&work.found, 1u);
work.nonce = m0;