From: Chris Duncan <chris@zoso.dev>
Date: Fri, 10 Jan 2025 14:53:10 +0000 (-0800)
Subject: Test how max workgroup size with single dimension dispatch affects mobile because... 
X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=e74509e083dfe57ff3e6c758341ee0191916bbb3;p=libnemo.git

Test how max workgroup size with single dimension dispatch affects mobile, because it does pretty well on desktop, as seen in the newest benchmark.
---

diff --git a/benchmarks.md b/benchmarks.md
index e78b851..da54b21 100644
--- a/benchmarks.md
+++ b/benchmarks.md
@@ -128,6 +128,24 @@ Geometric: 171.8797089689105 ms
 Minimum: 20.80000001192093 ms
 Maximum: 2093.199999988079 ms
 
+libnemo: Time to calculate proof-of-work for a send block 512 times
+(after increasing workgroup_size to 256 from 64 and decreasing dispatch size)
+Total: 149857.0999999717 ms
+Average: 292.6896484374447 ms
+Harmonic: 73.49751645489904 ms
+Geometric: 174.9560632035056 ms
+Minimum: 2.600000001490116 ms
+Maximum: 2364.5999999940395 ms
+
+libnemo: Time to calculate proof-of-work for a send block 512 times
+(after increasing workgroup_size to 256 from 64 and decreasing dispatch size)
+Total: 145201.70000004023 ms
+Average: 283.5970703125786 ms
+Harmonic: 54.2894989554052 ms
+Geometric: 155.58659283933008 ms
+Minimum: 2.5 ms
+Maximum: 1812.3999999985099 ms
+
 
 
 PowGpu: Time to calculate proof-of-work for a send block 32 times
diff --git a/global.min.js b/global.min.0.js
similarity index 99%
rename from global.min.js
rename to global.min.0.js
index 1d9336f..50114cc 100644
--- a/global.min.js
+++ b/global.min.0.js
@@ -3265,28 +3265,17 @@ const BLAKE2B_IV32_1: u32 = 0x6A09E667u;
 * 8-byte work is split into two 4-byte u32. Low 4 bytes are random u32 from
 * UBO. High 4 bytes are the random value XOR'd with index of each thread.
 */
-@compute @workgroup_size(64)
-fn main(
-	@builtin(workgroup_id) workgroup_id: vec3<u32>,
-	@builtin(local_invocation_id) local_id: vec3<u32>
-) {
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
 	if (atomicLoad(&work.found) != 0u) { return; }
 
 	let threshold: u32 = ubo.threshold;
 
-	/**
-	* Flatten 3D workgroup and local identifiers into u32 for each thread
-	*/
-	var id: u32 = ((workgroup_id.x & 0xFFu) << 24u) |
-		((workgroup_id.y & 0xFFu) << 16u) |
-		((workgroup_id.z & 0xFFu) << 8u) |
-		(local_id.x & 0xFFu);
-
 	/**
 	* Initialize (nonce||blockhash) concatenation
 	*/
-	var m0: u32 = ubo.random;
-	var m1: u32 = ubo.random ^ id;
+	var m0: u32 = ubo.random ^ global_id.x;
+	var m1: u32 = ubo.random ^ global_id.y;
 	var m2: u32 = ubo.blockhash[0u].x;
 	var m3: u32 = ubo.blockhash[0u].y;
 	var m4: u32 = ubo.blockhash[0u].z;
@@ -11513,7 +11502,7 @@ var init_gpu = __esm({
           const passEncoder = commandEncoder.beginComputePass();
           passEncoder.setPipeline(this.#pipeline);
           passEncoder.setBindGroup(0, bindGroup);
-          passEncoder.dispatchWorkgroups(256, 256, 256);
+          passEncoder.dispatchWorkgroups(255, 255);
           passEncoder.end();
           commandEncoder.copyBufferToBuffer(this.#gpuBuffer, 0, this.#cpuBuffer, 0, 12);
           this.#device.queue.submit([commandEncoder.finish()]);
diff --git a/index.html b/index.html
index c9d4dbe..ec4d5c0 100644
--- a/index.html
+++ b/index.html
@@ -3,7 +3,7 @@
 <head>
 	<link rel="icon" href="./favicon.ico">
 	<script type="module"
-		src="https://zoso.dev/?p=libnemo.git;a=blob_plain;f=global.min.js;hb=refs/heads/ios"></script>
+		src="https://zoso.dev/?p=libnemo.git;a=blob_plain;f=global.min.0.js;hb=refs/heads/threads"></script>
 	<!-- <script type="module" src="https://cdn.jsdelivr.net/npm/nano-webgl-pow@1.1.1/nano-webgl-pow.js"></script> -->
 	<script type="module">
 		(async () => {
diff --git a/src/lib/nano-pow/classes/gpu.ts b/src/lib/nano-pow/classes/gpu.ts
index 89cb9a0..a09a11c 100644
--- a/src/lib/nano-pow/classes/gpu.ts
+++ b/src/lib/nano-pow/classes/gpu.ts
@@ -173,7 +173,7 @@ export class NanoPowGpu {
 			// Issue commands and end compute pass structure
 			passEncoder.setPipeline(this.#pipeline)
 			passEncoder.setBindGroup(0, bindGroup)
-			passEncoder.dispatchWorkgroups(256, 256, 256)
+			passEncoder.dispatchWorkgroups(0xff, 0xff)
 			passEncoder.end()
 
 			// Copy 8-byte nonce and 4-byte found flag from GPU to CPU for reading
diff --git a/src/lib/nano-pow/shaders/gpu-compute.ts b/src/lib/nano-pow/shaders/gpu-compute.ts
index 784560b..2e7c140 100644
--- a/src/lib/nano-pow/shaders/gpu-compute.ts
+++ b/src/lib/nano-pow/shaders/gpu-compute.ts
@@ -23,28 +23,17 @@ const BLAKE2B_IV32_1: u32 = 0x6A09E667u;
 * 8-byte work is split into two 4-byte u32. Low 4 bytes are random u32 from
 * UBO. High 4 bytes are the random value XOR'd with index of each thread.
 */
-@compute @workgroup_size(64)
-fn main(
-	@builtin(workgroup_id) workgroup_id: vec3<u32>,
-	@builtin(local_invocation_id) local_id: vec3<u32>
-) {
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) global_id: vec3<u32>) {
 	if (atomicLoad(&work.found) != 0u) { return; }
 
 	let threshold: u32 = ubo.threshold;
 
-	/**
-	* Flatten 3D workgroup and local identifiers into u32 for each thread
-	*/
-	var id: u32 = ((workgroup_id.x & 0xFFu) << 24u) |
-		((workgroup_id.y & 0xFFu) << 16u) |
-		((workgroup_id.z & 0xFFu) << 8u) |
-		(local_id.x & 0xFFu);
-
 	/**
 	* Initialize (nonce||blockhash) concatenation
 	*/
-	var m0: u32 = ubo.random;
-	var m1: u32 = ubo.random ^ id;
+	var m0: u32 = ubo.random ^ global_id.x;
+	var m1: u32 = ubo.random ^ global_id.y;
 	var m2: u32 = ubo.blockhash[0u].x;
 	var m3: u32 = ubo.blockhash[0u].y;
 	var m4: u32 = ubo.blockhash[0u].z;
diff --git a/test/perf.block.js b/test/perf.block.js
index 3d43203..73f8b91 100644
--- a/test/perf.block.js
+++ b/test/perf.block.js
@@ -9,7 +9,7 @@ import { PowGl, NanoPowGpu, SendBlock } from '#dist/main.js'
 import 'nano-webgl-pow'
 
 await suite('Block performance', async () => {
-	const COUNT = 0x20
+	const COUNT = 0x200
 
 	await test(`PowGpu: Calculate proof-of-work for 6 unique block hashes`, async () => {
 		const times = []