From 4e5a6a5365e27beb4bbc798643ba66a7a14b5a10 Mon Sep 17 00:00:00 2001
From: Chris Duncan <chris@zoso.dev>
Date: Wed, 1 Jan 2025 02:08:33 -0800
Subject: [PATCH] Clarify the limitations of workgroup dispatching and sizes,
 and correct how the workload is determined and passed into the shader,
 including the bytes available for each pass. (Hint: we will always need to
 pass in at least some crypto bytes due to the sheer size of the computation;
 e.g. 256x256 is 65k nonces, but that covers only 2 of the 8 bytes required.)
 Start filling out the actual hashing function.

---
 src/lib/workers/powgpu.ts | 78 ++++++++++++++++-----------------------
 1 file changed, 32 insertions(+), 46 deletions(-)
diff --git a/src/lib/workers/powgpu.ts b/src/lib/workers/powgpu.ts
index 81bdc57..7897f26 100644
--- a/src/lib/workers/powgpu.ts
+++ b/src/lib/workers/powgpu.ts
@@ -4,7 +4,6 @@
 // https://github.com/numtel/nano-webgl-pow
 /// <reference types="@webgpu/types" />
 import { WorkerInterface } from '../pool.js'
-import powgl from './powgl.js'
 
 export class PowGpu extends WorkerInterface {
 	static {
@@ -42,19 +41,15 @@ export class PowGpu extends WorkerInterface {
 		})
 	}
 
-	static #workDispatchSize: number = Math.max(1, Math.floor(navigator.hardwareConcurrency))
-	static #workgroupSize: number = 256 // Must align with shader
-	static #workload: number = this.#workDispatchSize * this.#workgroupSize
-
 	// WebGPU Compute Shader
 	static #shader = `
 		struct UBO {
 			blockhash: array<vec4<u32>, 2>,
-			threshold: u32,
-			workload: u32
+			rand: vec2<u32>,
+			threshold: u32
 		};
 		@group(0) @binding(0) var<uniform> ubo: UBO;
-		@group(0) @binding(1) var<storage, read_write> work: array<vec2<u32>>;
+		@group(0) @binding(1) var<storage, read_write> work: u32;
 
 		/**
 		* Defined separately from uint v[32] below as the original value is required
@@ -142,18 +137,18 @@ export class PowGpu extends WorkerInterface {
 		/**
 		* Main compute function
 		*/
-		@compute @workgroup_size(${this.#workgroupSize}, ${this.#workgroupSize})
-		fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-			// Check bounds, may be unnecessary with proper dispatch size
-			if (global_id.x >= ubo.workload || global_id.y >= ubo.workload) {
-				return;
-			}
-
+		@compute @workgroup_size(256)
+		fn main(
+			@builtin(global_invocation_id) global_id: vec3<u32>,
+			@builtin(workgroup_id) workgroup_id: vec3<u32>,
+			@builtin(local_invocation_id) local_id: vec3<u32>
+		) {
 			var m: array<u32, 16>;
 
-			// Workgroup index defines work value for this calculation point
-			m[0u] = global_id.x;
-			m[1u] = global_id.y;
+			// 8-byte work is split into two 4-byte u32
+			// First 3 bytes provided, last byte defined by this compute index
+			m[0u] = (ubo.rand.x << 8u) ^ workgroup_id.x;
+			m[1u] = (ubo.rand.y << 8u) ^ local_id.x;
 
 			// Block hash
 			m[2u] = ubo.blockhash[0u].x;
@@ -200,10 +195,9 @@ export class PowGpu extends WorkerInterface {
 			}
 
 			// Store the result directly into work array
-			// if ((BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > 0) {
-				work[global_id.x].x = 1u ^ (2u << 8u);
-				work[global_id.x].y = 3u ^ (4u << 8u);
-			// }
+			if ((BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > 0u) {
+				work = global_id.x;
+			}
 		}
 	`;
 
@@ -239,11 +233,11 @@ export class PowGpu extends WorkerInterface {
 
 						// Create buffers for writing GPU calculations and reading from Javascript
 						this.#gpuBuffer = this.#device.createBuffer({
-							size: this.#workload,
+							size: 16,
 							usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST | GPUBufferUsage.COPY_SRC
 						})
 						this.#cpuBuffer = this.#device.createBuffer({
-							size: this.#workload,
+							size: 16,
 							usage: GPUBufferUsage.COPY_DST | GPUBufferUsage.MAP_READ
 						})
 
@@ -300,8 +294,10 @@ export class PowGpu extends WorkerInterface {
 			const uint32 = hashHex.slice(i, i + 8)
 			uboView.setUint32(i / 2, parseInt(uint32, 16))
 		}
-		uboView.setUint32(32, threshold, true)
-		uboView.setUint32(40, PowGpu.#workload, true)
+		const rand = crypto.getRandomValues(new Uint32Array(2))
+		uboView.setUint32(32, rand[0], true)
+		uboView.setUint32(36, rand[1], true)
+		uboView.setUint32(40, threshold, true)
 		const uboBuffer = PowGpu.#device.createBuffer({
 			size: uboView.byteLength,
 			usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
@@ -334,7 +330,8 @@ export class PowGpu extends WorkerInterface {
 		// Issue commands and end render pass
 		passEncoder.setPipeline(PowGpu.#pipeline)
 		passEncoder.setBindGroup(0, bindGroup)
-		passEncoder.dispatchWorkgroups(PowGpu.#workDispatchSize)
+		// passEncoder.dispatchWorkgroups(256 * Math.max(1, navigator.hardwareConcurrency))
+		passEncoder.dispatchWorkgroups(256)
 		passEncoder.end()
 
 		// Copy result from GPU buffer to CPU buffer
@@ -343,7 +340,7 @@ export class PowGpu extends WorkerInterface {
 			0,
 			PowGpu.#cpuBuffer,
 			0,
-			PowGpu.#workload
+			16
 		)
 
 		// End frame by passing array of command buffers to command queue for execution
@@ -351,29 +348,18 @@ export class PowGpu extends WorkerInterface {
 
 		// Read results back to Javascript and then unmap buffer after reading
 		await PowGpu.#cpuBuffer.mapAsync(GPUMapMode.READ)
-		const result = new Uint32Array(PowGpu.#cpuBuffer.getMappedRange())
+		await PowGpu.#device.queue.onSubmittedWorkDone()
+		const result = new Uint32Array(PowGpu.#cpuBuffer.getMappedRange()).slice()
 		PowGpu.#cpuBuffer.unmap()
 
 		console.log(`result`)
 		console.dir(result)
-		console.log(`result?`)
-		for (let i = 0; i < result.length; i += 2) {
-			const work = new Uint8Array([result[i], result[i + 1]])
-			console.log(`result[${i}]: ${result[i]}`)
-			console.log(`result[${i + 1}]: ${result[i + 1]}`)
-			console.log(`work: ${work}`)
-			if (result[i] !== 0 || result[i + 1] !== 0) {
-				// const hex = PowGpu.#hexify(work.subarray(4, 8)) + PowGpu.#hexify([
-				// 	result[i + 2],
-				// 	result[i + 3],
-				// 	work[2] ^ (result[i] - 1),
-				// 	work[3] ^ (result[i + 1] - 1)
-				// ])
-				const hex = PowGpu.#hexify(work)
-				typeof callback === 'function' && callback(hex)
-				return
-			}
+		if (result[0] !== 0 || result[1] !== 0) {
+			const hex = PowGpu.#hexify([...result])
+			typeof callback === 'function' && callback(hex)
+			return
 		}
+
 		// No result found. Redraw
 		requestAnimationFrame(() => this.#calculate(hashHex, callback, threshold))
 	}
-- 
2.34.1