From: Chris Duncan Date: Fri, 10 Jan 2025 14:53:10 +0000 (-0800) Subject: Test how max workgroup size with single dimension dispatch affects mobile because... X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=e74509e083dfe57ff3e6c758341ee0191916bbb3;p=libnemo.git Test how max workgroup size with single dimension dispatch affects mobile because it does pretty well for desktop as seen by the newest benchmark. --- diff --git a/benchmarks.md b/benchmarks.md index e78b851..da54b21 100644 --- a/benchmarks.md +++ b/benchmarks.md @@ -128,6 +128,24 @@ Geometric: 171.8797089689105 ms Minimum: 20.80000001192093 ms Maximum: 2093.199999988079 ms +libnemo: Time to calculate proof-of-work for a send block 512 times +(after increasing workgroup_size to 256 from 64 and decreasing dispatch size) +Total: 149857.0999999717 ms +Average: 292.6896484374447 ms +Harmonic: 73.49751645489904 ms +Geometric: 174.9560632035056 ms +Minimum: 2.600000001490116 ms +Maximum: 2364.5999999940395 ms + +GLOBALS.mjs:46 PASS libnemo: Time to calculate proof-of-work for a send block 512 times +(after increasing workgroup_size to 256 from 64 and decreasing dispatch size) +Total: 145201.70000004023 ms +Average: 283.5970703125786 ms +Harmonic: 54.2894989554052 ms +Geometric: 155.58659283933008 ms +Minimum: 2.5 ms +Maximum: 1812.3999999985099 ms + PowGpu: Time to calculate proof-of-work for a send block 32 times diff --git a/global.min.js b/global.min.0.js similarity index 99% rename from global.min.js rename to global.min.0.js index 1d9336f..50114cc 100644 --- a/global.min.js +++ b/global.min.0.js @@ -3265,28 +3265,17 @@ const BLAKE2B_IV32_1: u32 = 0x6A09E667u; * 8-byte work is split into two 4-byte u32. Low 4 bytes are random u32 from * UBO. High 4 bytes are the random value XOR'd with index of each thread. 
*/ -@compute @workgroup_size(64) -fn main( - @builtin(workgroup_id) workgroup_id: vec3<u32>, - @builtin(local_invocation_id) local_id: vec3<u32> -) { +@compute @workgroup_size(256) +fn main(@builtin(global_invocation_id) global_id: vec3<u32>) { if (atomicLoad(&work.found) != 0u) { return; } let threshold: u32 = ubo.threshold; - /** - * Flatten 3D workgroup and local identifiers into u32 for each thread - */ - var id: u32 = ((workgroup_id.x & 0xFFu) << 24u) | - ((workgroup_id.y & 0xFFu) << 16u) | - ((workgroup_id.z & 0xFFu) << 8u) | - (local_id.x & 0xFFu); - /** * Initialize (nonce||blockhash) concatenation */ - var m0: u32 = ubo.random; - var m1: u32 = ubo.random ^ id; + var m0: u32 = ubo.random ^ global_id.x; + var m1: u32 = ubo.random ^ global_id.y; var m2: u32 = ubo.blockhash[0u].x; var m3: u32 = ubo.blockhash[0u].y; var m4: u32 = ubo.blockhash[0u].z; @@ -11513,7 +11502,7 @@ var init_gpu = __esm({ const passEncoder = commandEncoder.beginComputePass(); passEncoder.setPipeline(this.#pipeline); passEncoder.setBindGroup(0, bindGroup); - passEncoder.dispatchWorkgroups(256, 256, 256); + passEncoder.dispatchWorkgroups(255, 255); passEncoder.end(); commandEncoder.copyBufferToBuffer(this.#gpuBuffer, 0, this.#cpuBuffer, 0, 12); this.#device.queue.submit([commandEncoder.finish()]); diff --git a/index.html b/index.html index c9d4dbe..ec4d5c0 100644 --- a/index.html +++ b/index.html @@ -3,7 +3,7 @@ + src="https://zoso.dev/?p=libnemo.git;a=blob_plain;f=global.min.0.js;hb=refs/heads/threads">