* The 8-byte work value is split into two 4-byte u32 halves. The low 4 bytes
* are a random u32 from the UBO; the high 4 bytes are that random value
* XOR'd with each thread's flattened index.
*/
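
/**
* Layout sketch (assumed): WGSL has no 64-bit integers, so the nonce is
* carried as two u32 words and reassembled on the host as
* nonce = (high << 32) | low, with low = m[0u] and high = m[1u] below.
*/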
- @compute @workgroup_size(256)
+ @compute @workgroup_size(64)
fn main(
@builtin(workgroup_id) workgroup_id: vec3<u32>,
@builtin(local_invocation_id) local_id: vec3<u32>
) {
/**
* Early out if another invocation has already published a nonce
*/
if (atomicLoad(&work.found) != 0u) {
return;
}
+
/**
* Flatten the 3D workgroup and local identifiers into a u32 unique to each
* thread
*/
let id: u32 = ((workgroup_id.x & 0xff) << 24) |
((workgroup_id.y & 0xff) << 16) |
((workgroup_id.z & 0xff) << 8) |
(local_id.x & 0xff);
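
/**
* With the 0xff masks, each component contributes one byte to id, so ids
* stay unique only while a dispatch uses at most 256 workgroups per
* dimension; local_id.x is already below 64 given @workgroup_size(64).
*/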
+
+ /**
+ * Initialize the (nonce||blockhash) concatenation; the remaining words of
+ * the 128-byte message block stay zero (WGSL zero-initializes var),
+ * providing the BLAKE2B padding
+ */
var m: array<u32, 32>;
m[0u] = ubo.random;
- m[1u] = id ^ ubo.random;
+ m[1u] = ubo.random ^ id;
m[2u] = ubo.blockhash[0u].x;
m[3u] = ubo.blockhash[0u].y;
m[4u] = ubo.blockhash[0u].z;
0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u
);
- /**
- * Twelve rounds of mixing as part of compression step
- */
+ for (var i: u32 = 0u; i < 0xFFFFu; i = i + 1u) {
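+ /**
+ * Assumption about the elided code above: v is the BLAKE2B working vector
+ * (16 64-bit words as u32 pairs) initialized from the IV, with the counter
+ * and final-block flag baked in. For each candidate to hash correctly,
+ * that initialization must be repeated at the top of every loop iteration;
+ * otherwise state carries over between attempts.
+ */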
+ /**
+ * Twelve rounds of mixing as part of compression step
+ */
for (var r: u32 = 0u; r < 12u; r = r + 1u) {
- G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
- G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
- G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
- G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
- G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
- G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
- G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
- G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
- }
+ G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
+ G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
+ G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
+ G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
+ G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
+ G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
+ G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
+ G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
+ }
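+
+ /**
+ * G mixes 64-bit words stored as u32 pairs, which is why its v indices
+ * are the even values 0u through 30u. SIGMA82 is presumably the standard
+ * BLAKE2B sigma schedule with each entry pre-doubled so it can index u32
+ * pairs in m directly.
+ */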
- /**
- * Set nonce if it passes the threshold and no other thread has set it
- */
- if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
- atomicStore(&work.found, 1u);
- work.nonce.x = m[0];
- work.nonce.y = m[1];
- return;
+ /**
+ * Set nonce if it passes the threshold and no other thread has set it
+ */
+ if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
+ atomicStore(&work.found, 1u);
+ work.nonce.x = m[0u];
+ work.nonce.y = m[1u];
+ return;
+ }
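+
+ /**
+ * The load-then-store pair above is not atomic as a whole, so two threads
+ * may both pass the check and race on work.nonce. Any nonce that clears
+ * the threshold is valid, so the race is benign. A stricter variant
+ * (sketch) could claim the flag first with WGSL's compare-exchange:
+ *
+ * if (atomicCompareExchangeWeak(&work.found, 0u, 1u).exchanged) {
+ * work.nonce.x = m[0u];
+ * work.nonce.y = m[1u];
+ * return;
+ * }
+ */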
+
+ /**
+ * If not found, move on to the next nonce candidate. Because m[1u] already
+ * encodes the thread id, incrementing m[0u] by 1 keeps every (m[0u], m[1u])
+ * pair unique across threads; striding by the total number of active
+ * threads works too (see the sketch after the increment below).
+ */
+ m[0u] = m[0u] + 1u;
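+
+ /**
+ * Alternative stride (sketch): step by the total thread count instead of
+ * 1, e.g. m[0u] = m[0u] + ubo.total_threads; note that total_threads is a
+ * hypothetical UBO field, not part of the shader shown here.
+ */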
}
- /**
- * Nonce not found in this execution context
- */
+ /**
+ * No valid nonce found by this thread within the 0xFFFFu-iteration limit.
+ * Exit; a later dispatch (or a CPU fallback) can continue the search.
+ */
return;
}
`;