From: Chris Duncan Date: Thu, 2 Jan 2025 13:55:02 +0000 (-0800) Subject: Start experimenting with 4-byte assignment using workgroup size as 1 byte via local_i... X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=86e3a1e8cd0831786ea09ee084067e4f6b25d6f9;p=libnemo.git Start experimenting with 4-byte assignment using workgroup size as 1 byte via local_invocation_id and then 3 dimensions in dispatch as 3 more bytes via workgroup_id.x/y/z, all concatenated into global_invocation_id. Looping in compute shaders is also OK versus fragment shaders, so iterate within the shader instead of reading out every frame with CPU. --- diff --git a/src/lib/workers/powgpu.ts b/src/lib/workers/powgpu.ts index b27f31c..7d7774d 100644 --- a/src/lib/workers/powgpu.ts +++ b/src/lib/workers/powgpu.ts @@ -142,11 +142,13 @@ export class PowGpu extends WorkerInterface { */ @compute @workgroup_size(256) fn main(@builtin(global_invocation_id) global_id: vec3) { - var m: array; - - m[0u] = ubo.rand.x; - m[1u] = (ubo.rand.y << 16u) ^ global_id.x; + work.x = 0u; + work.y = 0u; + work.z = 0u; + var m: array; + m[0u] = 0u; + m[1u] = global_id.x; m[2u] = ubo.blockhash[0u].x; m[3u] = ubo.blockhash[0u].y; m[4u] = ubo.blockhash[0u].z; @@ -156,44 +158,51 @@ export class PowGpu extends WorkerInterface { m[8u] = ubo.blockhash[1u].z; m[9u] = ubo.blockhash[1u].w; - /** - * Compression buffer, intialized to 2 instances of the initialization vector - * The following values have been modified from the BLAKE2B_IV: - * OUTLEN is constant 8 bytes - * v[0] ^= 0x01010000u ^ uint(OUTLEN); - * INLEN is constant 40 bytes: work value (8) + block hash (32) - * v[24] ^= uint(INLEN); - * It's always the "last" compression at this INLEN - * v[28] = ~v[28]; - * v[29] = ~v[29]; - */ - var v = array( - 0xF2BDC900u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u, - 0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au, - 0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu, - 0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u, - 0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u, - 0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au, - 0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu, - 0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u - ); - - // twelve rounds of mixing - for (var i: u32 = 0u; i < 12u; i = i + 1u) { - B2B_G(&v, &m, 0, 8, 16, 24, SIGMA82[i * 16 + 0], SIGMA82[i * 16 + 1]); - B2B_G(&v, &m, 2, 10, 18, 26, SIGMA82[i * 16 + 2], SIGMA82[i * 16 + 3]); - B2B_G(&v, &m, 4, 12, 20, 28, SIGMA82[i * 16 + 4], SIGMA82[i * 16 + 5]); - B2B_G(&v, &m, 6, 14, 22, 30, SIGMA82[i * 16 + 6], SIGMA82[i * 16 + 7]); - B2B_G(&v, &m, 0, 10, 20, 30, SIGMA82[i * 16 + 8], SIGMA82[i * 16 + 9]); - B2B_G(&v, &m, 2, 12, 22, 24, SIGMA82[i * 16 + 10], SIGMA82[i * 16 + 11]); - B2B_G(&v, &m, 4, 14, 16, 26, SIGMA82[i * 16 + 12], SIGMA82[i * 16 + 13]); - B2B_G(&v, &m, 6, 8, 18, 28, SIGMA82[i * 16 + 14], SIGMA82[i * 16 + 15]); - } + var i: u32 = 0u; + while (work.x == 0u) { + m[0u] = i; + i = i + 1u; + + /** + * Compression buffer, intialized to 2 instances of the initialization vector + * The following values have been modified from the BLAKE2B_IV: + * OUTLEN is constant 8 bytes + * v[0] ^= 0x01010000u ^ uint(OUTLEN); + * INLEN is constant 40 bytes: work value (8) + block hash (32) + * v[24] ^= uint(INLEN); + * It's always the "last" compression at this INLEN + * v[28] = ~v[28]; + * v[29] = ~v[29]; + */ + var v = array( + 0xF2BDC900u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u, + 0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au, + 0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu, + 0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u, + 0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u, + 0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au, + 0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu, + 0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u + ); + + // twelve rounds of mixing + for (var i: u32 = 0u; i < 12u; i = i + 1u) { + B2B_G(&v, &m, 0, 8, 16, 24, SIGMA82[i * 16 + 0], SIGMA82[i * 16 + 1]); + B2B_G(&v, &m, 2, 10, 18, 26, SIGMA82[i * 16 + 2], SIGMA82[i * 16 + 3]); + B2B_G(&v, &m, 4, 12, 20, 28, SIGMA82[i * 16 + 4], SIGMA82[i * 16 + 5]); + B2B_G(&v, &m, 6, 14, 22, 30, SIGMA82[i * 16 + 6], SIGMA82[i * 16 + 7]); + B2B_G(&v, &m, 0, 10, 20, 30, SIGMA82[i * 16 + 8], SIGMA82[i * 16 + 9]); + B2B_G(&v, &m, 2, 12, 22, 24, SIGMA82[i * 16 + 10], SIGMA82[i * 16 + 11]); + B2B_G(&v, &m, 4, 14, 16, 26, SIGMA82[i * 16 + 12], SIGMA82[i * 16 + 13]); + B2B_G(&v, &m, 6, 8, 18, 28, SIGMA82[i * 16 + 14], SIGMA82[i * 16 + 15]); + } - // Store the result directly into work array - if ((BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > 0xffffffffu) { - work.x = ubo.rand.x; - work.y = (ubo.rand.y << 16u) ^ global_id.x; + // Store the result directly into work array + if ((BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > ubo.threshold) { + work.x = 1u; + work.y = m[0u]; + work.z = m[1u]; + } } } `; @@ -328,8 +337,7 @@ export class PowGpu extends WorkerInterface { // Issue commands and end render pass passEncoder.setPipeline(PowGpu.#pipeline) passEncoder.setBindGroup(0, bindGroup) - // passEncoder.dispatchWorkgroups(256 * Math.max(1, navigator.hardwareConcurrency)) - passEncoder.dispatchWorkgroups(256) + passEncoder.dispatchWorkgroups(256, 256, 256) passEncoder.end() // Copy result from GPU buffer to CPU buffer @@ -350,9 +358,8 @@ export class PowGpu extends WorkerInterface { const result = new Uint32Array(PowGpu.#cpuBuffer.getMappedRange()).slice() PowGpu.#cpuBuffer.unmap() - console.log(`result: ${[...result]}`) - if (result[0] !== 0 || result[1] !== 0) { - const hex = PowGpu.#hexify([result[0], result[1]]) + if (result[0] !== 0) { + const hex = PowGpu.#hexify([result[1], result[2]]) typeof callback === 'function' && callback(hex) return }