From: Chris Duncan <chris@zoso.dev>
Date: Thu, 2 Jan 2025 13:55:02 +0000 (-0800)
Subject: Start experimenting with 4-byte assignment using workgroup size as 1 byte via local_i... 
X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=86e3a1e8cd0831786ea09ee084067e4f6b25d6f9;p=libnemo.git

Start experimenting with 4-byte assignment: the workgroup size supplies 1 byte via local_invocation_id, and the 3 dispatch dimensions supply 3 more bytes via workgroup_id.x/y/z, all concatenated into global_invocation_id. Looping is also acceptable in compute shaders, unlike in fragment shaders, so iterate within the shader instead of reading the result back on the CPU every frame.
---

diff --git a/src/lib/workers/powgpu.ts b/src/lib/workers/powgpu.ts
index b27f31c..7d7774d 100644
--- a/src/lib/workers/powgpu.ts
+++ b/src/lib/workers/powgpu.ts
@@ -142,11 +142,13 @@ export class PowGpu extends WorkerInterface {
 		*/
 		@compute @workgroup_size(256)
 		fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
-			var m: array<u32, 16>;
-
-			m[0u] = ubo.rand.x;
-			m[1u] = (ubo.rand.y << 16u) ^ global_id.x;
+			work.x = 0u;
+			work.y = 0u;
+			work.z = 0u;
 
+			var m: array<u32, 16>;
+			m[0u] = 0u;
+			m[1u] = global_id.x;
 			m[2u] = ubo.blockhash[0u].x;
 			m[3u] = ubo.blockhash[0u].y;
 			m[4u] = ubo.blockhash[0u].z;
@@ -156,44 +158,51 @@ export class PowGpu extends WorkerInterface {
 			m[8u] = ubo.blockhash[1u].z;
 			m[9u] = ubo.blockhash[1u].w;
 
-			/**
-			* Compression buffer, intialized to 2 instances of the initialization vector
-			* The following values have been modified from the BLAKE2B_IV:
-			* OUTLEN is constant 8 bytes
-			* v[0] ^= 0x01010000u ^ uint(OUTLEN);
-			* INLEN is constant 40 bytes: work value (8) + block hash (32)
-			* v[24] ^= uint(INLEN);
-			* It's always the "last" compression at this INLEN
-			* v[28] = ~v[28];
-			* v[29] = ~v[29];
-			*/
-			var v = array<u32, 32>(
-				0xF2BDC900u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u,
-				0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au,
-				0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu,
-				0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u,
-				0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u,
-				0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au,
-				0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu,
-				0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u
-			);
-
-			// twelve rounds of mixing
-			for (var i: u32 = 0u; i < 12u; i = i + 1u) {
-				B2B_G(&v, &m, 0, 8, 16, 24, SIGMA82[i * 16 + 0], SIGMA82[i * 16 + 1]);
-				B2B_G(&v, &m, 2, 10, 18, 26, SIGMA82[i * 16 + 2], SIGMA82[i * 16 + 3]);
-				B2B_G(&v, &m, 4, 12, 20, 28, SIGMA82[i * 16 + 4], SIGMA82[i * 16 + 5]);
-				B2B_G(&v, &m, 6, 14, 22, 30, SIGMA82[i * 16 + 6], SIGMA82[i * 16 + 7]);
-				B2B_G(&v, &m, 0, 10, 20, 30, SIGMA82[i * 16 + 8], SIGMA82[i * 16 + 9]);
-				B2B_G(&v, &m, 2, 12, 22, 24, SIGMA82[i * 16 + 10], SIGMA82[i * 16 + 11]);
-				B2B_G(&v, &m, 4, 14, 16, 26, SIGMA82[i * 16 + 12], SIGMA82[i * 16 + 13]);
-				B2B_G(&v, &m, 6, 8, 18, 28, SIGMA82[i * 16 + 14], SIGMA82[i * 16 + 15]);
-			}
+			var i: u32 = 0u;
+			while (work.x == 0u) {
+				m[0u] = i;
+				i = i + 1u;
+
+				/**
+				* Compression buffer, initialized to 2 instances of the initialization vector
+				* The following values have been modified from the BLAKE2B_IV:
+				* OUTLEN is constant 8 bytes
+				* v[0] ^= 0x01010000u ^ uint(OUTLEN);
+				* INLEN is constant 40 bytes: work value (8) + block hash (32)
+				* v[24] ^= uint(INLEN);
+				* It's always the "last" compression at this INLEN
+				* v[28] = ~v[28];
+				* v[29] = ~v[29];
+				*/
+				var v = array<u32, 32>(
+					0xF2BDC900u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u,
+					0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au,
+					0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu,
+					0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u,
+					0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u,
+					0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au,
+					0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu,
+					0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u
+				);
+
+				// twelve rounds of mixing
+				for (var i: u32 = 0u; i < 12u; i = i + 1u) {
+					B2B_G(&v, &m, 0, 8, 16, 24, SIGMA82[i * 16 + 0], SIGMA82[i * 16 + 1]);
+					B2B_G(&v, &m, 2, 10, 18, 26, SIGMA82[i * 16 + 2], SIGMA82[i * 16 + 3]);
+					B2B_G(&v, &m, 4, 12, 20, 28, SIGMA82[i * 16 + 4], SIGMA82[i * 16 + 5]);
+					B2B_G(&v, &m, 6, 14, 22, 30, SIGMA82[i * 16 + 6], SIGMA82[i * 16 + 7]);
+					B2B_G(&v, &m, 0, 10, 20, 30, SIGMA82[i * 16 + 8], SIGMA82[i * 16 + 9]);
+					B2B_G(&v, &m, 2, 12, 22, 24, SIGMA82[i * 16 + 10], SIGMA82[i * 16 + 11]);
+					B2B_G(&v, &m, 4, 14, 16, 26, SIGMA82[i * 16 + 12], SIGMA82[i * 16 + 13]);
+					B2B_G(&v, &m, 6, 8, 18, 28, SIGMA82[i * 16 + 14], SIGMA82[i * 16 + 15]);
+				}
 
-			// Store the result directly into work array
-			if ((BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > 0xffffffffu) {
-				work.x = ubo.rand.x;
-				work.y = (ubo.rand.y << 16u) ^ global_id.x;
+				// Store the result directly into work array
+				if ((BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) {
+					work.x = 1u;
+					work.y = m[0u];
+					work.z = m[1u];
+				}
 			}
 		}
 	`;
@@ -328,8 +337,7 @@ export class PowGpu extends WorkerInterface {
 		// Issue commands and end render pass
 		passEncoder.setPipeline(PowGpu.#pipeline)
 		passEncoder.setBindGroup(0, bindGroup)
-		// passEncoder.dispatchWorkgroups(256 * Math.max(1, navigator.hardwareConcurrency))
-		passEncoder.dispatchWorkgroups(256)
+		passEncoder.dispatchWorkgroups(256, 256, 256)
 		passEncoder.end()
 
 		// Copy result from GPU buffer to CPU buffer
@@ -350,9 +358,8 @@ export class PowGpu extends WorkerInterface {
 		const result = new Uint32Array(PowGpu.#cpuBuffer.getMappedRange()).slice()
 		PowGpu.#cpuBuffer.unmap()
 
-		console.log(`result: ${[...result]}`)
-		if (result[0] !== 0 || result[1] !== 0) {
-			const hex = PowGpu.#hexify([result[0], result[1]])
+		if (result[0] !== 0) {
+			const hex = PowGpu.#hexify([result[1], result[2]])
 			typeof callback === 'function' && callback(hex)
 			return
 		}