From: Chris Duncan
Date: Sun, 5 Jan 2025 05:57:29 +0000 (-0800)
Subject: Decrease workgroup size and implement loop in shader main. Add some notes for potenti...
X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=7f2eb5510131e16221bfb752d1f98c8d276b05f7;p=libnemo.git

Decrease workgroup size and implement loop in shader main. Add some notes for potential changes later.
---

diff --git a/src/lib/workers/powgpu.ts b/src/lib/workers/powgpu.ts
index 04d0b02..e9e5901 100644
--- a/src/lib/workers/powgpu.ts
+++ b/src/lib/workers/powgpu.ts
@@ -145,7 +145,7 @@ export class PowGpu extends WorkerInterface {
   * 8-byte work is split into two 4-byte u32. Low 4 bytes are random u32 from
   * UBO. High 4 bytes are the random value XOR'd with index of each thread.
   */
-  @compute @workgroup_size(256)
+  @compute @workgroup_size(64)
   fn main(
     @builtin(workgroup_id) workgroup_id: vec3<u32>,
     @builtin(local_invocation_id) local_id: vec3<u32>
@@ -153,6 +153,7 @@ export class PowGpu extends WorkerInterface {
     if (atomicLoad(&work.found) != 0u) {
       return;
     }
+
     /**
     * Flatten 3D workgroup and local identifiers into u32 for each thread
     */
@@ -160,9 +161,13 @@ export class PowGpu extends WorkerInterface {
       ((workgroup_id.y & 0xff) << 16) |
       ((workgroup_id.z & 0xff) << 8) |
       (local_id.x & 0xff);
+
+    /**
+    * Initialize (nonce||blockhash) concatenation
+    */
     var m: array<u32, 32>;
     m[0u] = ubo.random;
-    m[1u] = id ^ ubo.random;
+    m[1u] = ubo.random ^ id;
     m[2u] = ubo.blockhash[0u].x;
     m[3u] = ubo.blockhash[0u].y;
     m[4u] = ubo.blockhash[0u].z;
@@ -194,33 +199,41 @@ export class PowGpu extends WorkerInterface {
       0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u
     );
 
-    /**
-    * Twelve rounds of mixing as part of compression step
-    */
+    for (var i: u32 = 0u; i < 0xFFFFu; i = i + 1u) {
+      /**
+      * Twelve rounds of mixing as part of compression step
+      */
     for (var r: u32 = 0u; r < 12u; r = r + 1u) {
-      G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
-      G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
-      G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
-      G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
-      G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
-      G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
-      G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
-      G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
-    }
+        G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
+        G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
+        G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
+        G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
+        G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
+        G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
+        G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
+        G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
+      }
 
-    /**
-    * Set nonce if it passes the threshold and no other thread has set it
-    */
-    if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
-      atomicStore(&work.found, 1u);
-      work.nonce.x = m[0];
-      work.nonce.y = m[1];
-      return;
+      /**
+      * Set nonce if it passes the threshold and no other thread has set it
+      */
+      if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
+        atomicStore(&work.found, 1u);
+        work.nonce.x = m[0];
+        work.nonce.y = m[1];
+        return;
+      }
+
+      /**
+      * If not found, move on to the next nonce candidate.
+      * For example, increment by the total number of *active* threads,
+      * or simply increment by 1.
+      */
+      m[0u] = m[0u] + 1u;
     }
 
-    /**
-    * Nonce not found in this execution context
-    */
+    // No valid nonce found by this thread after 0xFFFF candidate attempts.
+    // Exit; another dispatch or a CPU fallback can continue the search.
     return;
   }
 `;
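
A hedged host-side sketch of how a dispatch could be sized around these changes: with @workgroup_size(64) and each invocation now looping over up to 0xFFFF nonce candidates, the host only needs enough workgroups to cover the desired thread count, and each dispatch dimension should stay at or below 256 because the shader masks each workgroup_id component to 8 bits before flattening it into the thread id. The TypeScript below is illustrative only; `device`, `pipeline`, and `bindGroup` are assumed to already exist and are not identifiers from this commit, and the workgroup counts are example values rather than the ones libnemo actually uses.

// Illustrative dispatch sizing (not part of this commit).
// GPU* types come from @webgpu/types or lib.dom; device, pipeline and
// bindGroup are assumed to have been created elsewhere.
declare const device: GPUDevice;
declare const pipeline: GPUComputePipeline;
declare const bindGroup: GPUBindGroup;

// The shader packs workgroup_id.x/y/z and local_id.x into single bytes of the
// thread id, so keep every dispatch dimension at 256 or less.
const WORKGROUPS_X = 256;
const WORKGROUPS_Y = 256;
const WORKGROUPS_Z = 1;

const encoder = device.createCommandEncoder();
const pass = encoder.beginComputePass();
pass.setPipeline(pipeline);
pass.setBindGroup(0, bindGroup);
// 256 * 256 * 1 workgroups of 64 invocations each; every invocation tries up
// to 0xFFFF nonce candidates before returning.
pass.dispatchWorkgroups(WORKGROUPS_X, WORKGROUPS_Y, WORKGROUPS_Z);
pass.end();
device.queue.submit([encoder.finish()]);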