From 822c24e112da41522421f8b91e2e5ae92fb98622 Mon Sep 17 00:00:00 2001
From: Chris Duncan <chris@zoso.dev>
Date: Sun, 5 Jan 2025 00:00:31 -0800
Subject: [PATCH] Scrap in-shader loop, since it is probably triggering browser
 watchdog timers as a long-running process on lower-end hardware; just
 redispatch instead.

---
 src/lib/workers/powgpu.ts | 60 +++++++++++++++++++--------------------
 1 file changed, 29 insertions(+), 31 deletions(-)
diff --git a/src/lib/workers/powgpu.ts b/src/lib/workers/powgpu.ts
index 7c5b826..920e090 100644
--- a/src/lib/workers/powgpu.ts
+++ b/src/lib/workers/powgpu.ts
@@ -53,8 +53,8 @@ export class PowGpu extends WorkerInterface {
 		@group(0) @binding(0) var<uniform> ubo: UBO;
 
 		struct WORK {
-			nonce: vec2<u32>,
-			found: atomic<u32>
+			found: atomic<u32>,
+			nonce: vec2<u32>
 		};
 		@group(0) @binding(1) var<storage, read_write> work: WORK;
 
@@ -145,7 +145,7 @@ export class PowGpu extends WorkerInterface {
 		* 8-byte work is split into two 4-byte u32. Low 4 bytes are random u32 from
 		* UBO. High 4 bytes are the random value XOR'd with index of each thread.
 		*/
-		@compute @workgroup_size(64)
+		@compute @workgroup_size(256)
 		fn main(
 			@builtin(workgroup_id) workgroup_id: vec3<u32>,
 			@builtin(local_invocation_id) local_id: vec3<u32>
@@ -197,32 +197,30 @@ export class PowGpu extends WorkerInterface {
 				0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u
 			);
 
-			for (var i: u32 = 0u; i < 0xFFu; i = i << 1u) {
-				m[0u] = m[0u] ^ i;
-				/**
-				* Twelve rounds of mixing as part of BLAKE2b compression step
-				*/
-				for (var r: u32 = 0u; r < 12u; r = r + 1u) {
-					G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
-					G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
-					G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
-					G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
-					G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
-					G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
-					G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
-					G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
-				}
+			/**
+			* Twelve rounds of mixing as part of BLAKE2b compression step
+			*/
+			for (var r: u32 = 0u; r < 12u; r = r + 1u) {
+				G(&v, &m, 0u, 8u, 16u, 24u, SIGMA82[r * 16u + 0u], SIGMA82[r * 16u + 1u]);
+				G(&v, &m, 2u, 10u, 18u, 26u, SIGMA82[r * 16u + 2u], SIGMA82[r * 16u + 3u]);
+				G(&v, &m, 4u, 12u, 20u, 28u, SIGMA82[r * 16u + 4u], SIGMA82[r * 16u + 5u]);
+				G(&v, &m, 6u, 14u, 22u, 30u, SIGMA82[r * 16u + 6u], SIGMA82[r * 16u + 7u]);
+				G(&v, &m, 0u, 10u, 20u, 30u, SIGMA82[r * 16u + 8u], SIGMA82[r * 16u + 9u]);
+				G(&v, &m, 2u, 12u, 22u, 24u, SIGMA82[r * 16u + 10u], SIGMA82[r * 16u + 11u]);
+				G(&v, &m, 4u, 14u, 16u, 26u, SIGMA82[r * 16u + 12u], SIGMA82[r * 16u + 13u]);
+				G(&v, &m, 6u, 8u, 18u, 28u, SIGMA82[r * 16u + 14u], SIGMA82[r * 16u + 15u]);
+			}
 
-				/**
-				* Set nonce if it passes the threshold and no other thread has set it
-				*/
-				if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
-					atomicStore(&work.found, 1u);
-					work.nonce.x = m[0];
-					work.nonce.y = m[1];
-					return;
-				}
+			/**
+			* Set nonce if it passes the threshold and no other thread has set it
+			*/
+			if (atomicLoad(&work.found) == 0u && (BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
+				atomicStore(&work.found, 1u);
+				work.nonce.x = m[0];
+				work.nonce.y = m[1];
+				return;
 			}
+
 			/**
 			* Nonce not found in this execution context
 			*/
@@ -327,8 +325,8 @@ export class PowGpu extends WorkerInterface {
 		uboView.setUint32(36, threshold, true)
 		PowGpu.#device.queue.writeBuffer(PowGpu.#uboBuffer, 0, uboView)
 
-		// Reset offset 8 `found` flag to 0u in WORK before each calculation
-		PowGpu.#device.queue.writeBuffer(PowGpu.#gpuBuffer, 8, new Uint32Array([0]))
+		// Reset `found` flag to 0u in WORK before each calculation
+		PowGpu.#device.queue.writeBuffer(PowGpu.#gpuBuffer, 0, new Uint32Array([0]))
 
 		// Bind UBO read and GPU write buffers
 		const bindGroup = PowGpu.#device.createBindGroup({
@@ -375,8 +373,8 @@ export class PowGpu extends WorkerInterface {
 		await PowGpu.#cpuBuffer.mapAsync(GPUMapMode.READ)
 		await PowGpu.#device.queue.onSubmittedWorkDone()
 		const data = new DataView(PowGpu.#cpuBuffer.getMappedRange())
-		const nonce = data.getBigUint64(0, true)
-		const found = !!data.getUint32(8)
+		const found = !!data.getUint32(0)
+		const nonce = data.getBigUint64(8, true)
 		PowGpu.#cpuBuffer.unmap()
 
 		if (found) {
-- 
2.34.1