From 3b1462830b94d2ba66b3fd8eaf25a945fd8c6a52 Mon Sep 17 00:00:00 2001 From: Chris Duncan Date: Wed, 12 Mar 2025 14:15:55 -0700 Subject: [PATCH] Refactor initialization to be extremely clear about how it is configured and why. --- src/shaders/compute.wgsl | 97 +++++++++++++++++++++++++++------------- 1 file changed, 67 insertions(+), 30 deletions(-) diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl index ba68f9e..e93382b 100644 --- a/src/shaders/compute.wgsl +++ b/src/shaders/compute.wgsl @@ -21,11 +21,64 @@ struct WORK { @group(0) @binding(1) var work: WORK; /** -* -* Numeric literal used in the finalization digest is the original value of the -* first element of the initialization vector:`blake2b_IV[0]`. +* Initialization vector defined by BLAKE2. Each vec2 represents two halves +* of the original u64 value from the reference implementation. They appear +* reversed pairwise as defined below, but this is an illusion due to endianness: +* the `x` component of the vector is the low bits and the `y` component is the +* high bits, and if you laid the bits out individually, they would match the +* little-endian 64-bit representation. */ -const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u); +const BLAKE2B_IV = array, 8>( + vec2(0xF3BCC908u, 0x6A09E667u), + vec2(0x84CAA73Bu, 0xBB67AE85u), + vec2(0xFE94F82Bu, 0x3C6EF372u), + vec2(0x5F1D36F1u, 0xA54FF53Au), + vec2(0xADE682D1u, 0x510E527Fu), + vec2(0x2B3E6C1Fu, 0x9B05688Cu), + vec2(0xFB41BD6Bu, 0x1F83D9ABu), + vec2(0x137E2179u, 0x5BE0CD19u) +); + +/** +* Parameter block as defined in BLAKE2 section 2.8 and configured as follows: +* maximal depth = 1, fanout = 1, digest byte length = 8 +*/ +const BLAKE2B_PARAM = vec2(0x01010008u, 0u); + +/** +* Message input length which is always 40 for Nano. +* 8 nonce bytes + 32 block hash bytes +*/ +const BLAKE2B_INLEN = vec2(0x00000028u, 0u); + +/** +* Finalization flag as defined in BLAKE2 section 2.4 and set to ~0 since this is +* the final (and only) message block being hashed. +*/ +const BLAKE2B_FINAL = vec2(0xFFFFFFFFu, 0xFFFFFFFFu); + +/** +* Fully initialized state array that is locally copied at each thread start. +* Application of each XOR is defined by BLAKE2 section 2.4 compression function. +*/ +const BLAKE2B_INIT = array, 16>( + BLAKE2B_IV[0u] ^ BLAKE2B_PARAM, + BLAKE2B_IV[1u], + BLAKE2B_IV[2u], + BLAKE2B_IV[3u], + BLAKE2B_IV[4u], + BLAKE2B_IV[5u], + BLAKE2B_IV[6u], + BLAKE2B_IV[7u], + BLAKE2B_IV[0u], + BLAKE2B_IV[1u], + BLAKE2B_IV[2u], + BLAKE2B_IV[3u], + BLAKE2B_IV[4u] ^ BLAKE2B_INLEN, + BLAKE2B_IV[5u], + BLAKE2B_IV[6u] ^ BLAKE2B_FINAL, + BLAKE2B_IV[7u] +); /** * Used to fill partial `m` vec4 constructions. @@ -97,32 +150,16 @@ fn main(id: vec3) { let m4: vec2 = ubo.blockhash[1u].zw; /** - * Compression buffer initialized to 2 instances of initialization vector. Each - * vec2 represents two halves of the original u64 value from the reference - * implementation. They appear reversed pairwise as defined below, but this is - * an illusion due to endianness: the `x` component of the vector is the low - * bits and the `y` component is the high bits, and if you laid the bits out - * individually, they would match the little-endian 64-bit representation. - * - * The following values have been modified from the BLAKE2B_IV: - * - * OUTLEN is constant 8 bytes - * v0.x ^= 0x01010000u ^ u32(OUTLEN); - * - * INLEN is constant 40 bytes: work value (8) + block hash (32) - * vC.x ^= u32(INLEN); - * - * It is always the "last" compression at this INLEN - * vE = ~vE; + * Compression buffer copied from the modified initialization vector. */ - var v01: vec4 = vec4(BLAKE2B_IV_0, 0x84CAA73Bu, 0xBB67AE85u); - var v23: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au); - var v45: vec4 = vec4(0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu); - var v67: vec4 = vec4(0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u); - var v89: vec4 = vec4(0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u); - var vAB: vec4 = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au); - var vCD: vec4 = vec4(0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu); - var vEF: vec4 = vec4(0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u); + var v01: vec4 = vec4(BLAKE2B_INIT[0u], BLAKE2B_INIT[1u]); + var v23: vec4 = vec4(BLAKE2B_INIT[2u], BLAKE2B_INIT[3u]); + var v45: vec4 = vec4(BLAKE2B_INIT[4u], BLAKE2B_INIT[5u]); + var v67: vec4 = vec4(BLAKE2B_INIT[6u], BLAKE2B_INIT[7u]); + var v89: vec4 = vec4(BLAKE2B_INIT[8u], BLAKE2B_INIT[9u]); + var vAB: vec4 = vec4(BLAKE2B_INIT[10u], BLAKE2B_INIT[11u]); + var vCD: vec4 = vec4(BLAKE2B_INIT[12u], BLAKE2B_INIT[13u]); + var vEF: vec4 = vec4(BLAKE2B_INIT[14u], BLAKE2B_INIT[15u]); /** * Temporary variables used for subprocesses i=4 through i=7 @@ -1624,7 +1661,7 @@ fn main(id: vec3) { * Set nonce if it passes the threshold and no other thread has set it. * Only high bits are needed for comparison since threshold low bits are zero. */ - if ((BLAKE2B_IV_0.y ^ v01.y ^ v89.y) >= ubo.threshold && atomicLoad(&work.found) == 0u) { + if ((BLAKE2B_IV[0u].y ^ v01.y ^ v89.y) >= ubo.threshold && atomicLoad(&work.found) == 0u) { atomicStore(&work.found, 1u); work.nonce = m0; } -- 2.34.1