]> zoso.dev Git - libnemo.git/commitdiff
Decrease workgroup size and implement loop in shader main. Add some notes for potenti...
authorChris Duncan <chris@zoso.dev>
Sun, 5 Jan 2025 05:57:29 +0000 (21:57 -0800)
committerChris Duncan <chris@zoso.dev>
Sun, 5 Jan 2025 05:57:29 +0000 (21:57 -0800)
src/lib/workers/powgpu.ts

index 04d0b02aced8fc16d08dd4e75666e67a6f6ebc08..e9e5901800b74f9cb99e0c9a5bcc09620d7c8fce 100644 (file)
@@ -145,7 +145,7 @@ export class PowGpu extends WorkerInterface {
                * 8-byte work is split into two 4-byte u32. Low 4 bytes are random u32 from
                * UBO. High 4 bytes are the random value XOR'd with index of each thread.
                */
-               @compute @workgroup_size(256)
+               @compute @workgroup_size(64)
                fn main(
                        @builtin(workgroup_id) workgroup_id: vec3<u32>,
                        @builtin(local_invocation_id) local_id: vec3<u32>
@@ -153,6 +153,7 @@ export class PowGpu extends WorkerInterface {
                        if (atomicLoad(&work.found) != 0u) {
                                return;
                        }
+
                        /**
                        * Flatten 3D workgroup and local identifiers into u32 for each thread
                        */
@@ -160,9 +161,13 @@ export class PowGpu extends WorkerInterface {
                                ((workgroup_id.y & 0xff) << 16) |
                                ((workgroup_id.z & 0xff) << 8) |
                                (local_id.x & 0xff);
+
+                       /**
+                       * Initialize (nonce||blockhash) concatenation
+                       */
                        var m: array<u32, 16>;
                        m[0u] = ubo.random;
-                       m[1u] = id ^ ubo.random;
+                       m[1u] = ubo.random ^ id;
                        m[2u] = ubo.blockhash[0u].x;
                        m[3u] = ubo.blockhash[0u].y;
                        m[4u] = ubo.blockhash[0u].z;
@@ -194,33 +199,41 @@ export class PowGpu extends WorkerInterface {
                                0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u
                        );
 
-                       /**
-                       * Twelve rounds of mixing as part of compression step
-                       */
+                       for (var i: u32 = 0u; i < 0xFFFFu; i = i + 1u) {
+                               /**
+                               * Twelve rounds of mixing as part of compression step
+                               */
                                for (var r: u32 = 0u; r < 12u; r = r + 1u) {
-                                               G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
-                                               G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
-                                               G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
-                                               G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
-                                               G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
-                                               G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
-                                               G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
-                                               G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
-                       }
+                                       G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
+                                       G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
+                                       G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
+                                       G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
+                                       G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
+                                       G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
+                                       G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
+                                       G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
+                               }
 
-                       /**
-                       * Set nonce if it passes the threshold and no other thread has set it
-                       */
-                       if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
-                               atomicStore(&work.found, 1u);
-                               work.nonce.x = m[0];
-                               work.nonce.y = m[1];
-                               return;
+                               /**
+                               * Set nonce if it passes the threshold and no other thread has set it
+                               */
+                               if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
+                                       atomicStore(&work.found, 1u);
+                                       work.nonce.x = m[0];
+                                       work.nonce.y = m[1];
+                                       return;
+                               }
+
+                               /**
+                               * If not found, move on to the next nonce candidate.
+                               * For example, increment by the total number of *active* threads,
+                               * or simply increment by 1.
+                               */
+                               m[0u] = m[0u] + 1u;
                        }
 
-                       /**
-                       * Nonce not found in this execution context
-                       */
+                       // No valid nonce found by this thread within the 0xFFFF loop iterations above.
+                       // Exit. Another dispatch or CPU fallback might handle continuing.
                        return;
                }
        `;