From 3b1462830b94d2ba66b3fd8eaf25a945fd8c6a52 Mon Sep 17 00:00:00 2001
From: Chris Duncan <chris@zoso.dev>
Date: Wed, 12 Mar 2025 14:15:55 -0700
Subject: [PATCH] Refactor initialization to be extremely clear about how it is
 configured and why.

---
 src/shaders/compute.wgsl | 97 +++++++++++++++++++++++++++-------------
 1 file changed, 67 insertions(+), 30 deletions(-)
diff --git a/src/shaders/compute.wgsl b/src/shaders/compute.wgsl
index ba68f9e..e93382b 100644
--- a/src/shaders/compute.wgsl
+++ b/src/shaders/compute.wgsl
@@ -21,11 +21,64 @@ struct WORK {
 @group(0) @binding(1) var<storage, read_write> work: WORK;
 
 /**
-*
-* Numeric literal used in the finalization digest is the original value of the
-* first element of the initialization vector:`blake2b_IV[0]`.
+* Initialization vector defined by BLAKE2b. Each vec2<u32> holds the two 32-bit
+* halves of one u64 value from the reference implementation. The halves appear
+* swapped relative to the reference's hex literals, but this is only an
+* artifact of endianness: the `x` component holds the low 32 bits and the `y`
+* component holds the high 32 bits, so the bytes in memory match the
+* little-endian 64-bit representation.
 */
-const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u);
+const BLAKE2B_IV = array<vec2<u32>, 8>(
+	vec2<u32>(0xF3BCC908u, 0x6A09E667u), // IV[0] = 0x6A09E667F3BCC908
+	vec2<u32>(0x84CAA73Bu, 0xBB67AE85u), // IV[1] = 0xBB67AE8584CAA73B
+	vec2<u32>(0xFE94F82Bu, 0x3C6EF372u), // IV[2] = 0x3C6EF372FE94F82B
+	vec2<u32>(0x5F1D36F1u, 0xA54FF53Au), // IV[3] = 0xA54FF53A5F1D36F1
+	vec2<u32>(0xADE682D1u, 0x510E527Fu), // IV[4] = 0x510E527FADE682D1
+	vec2<u32>(0x2B3E6C1Fu, 0x9B05688Cu), // IV[5] = 0x9B05688C2B3E6C1F
+	vec2<u32>(0xFB41BD6Bu, 0x1F83D9ABu), // IV[6] = 0x1F83D9ABFB41BD6B
+	vec2<u32>(0x137E2179u, 0x5BE0CD19u)  // IV[7] = 0x5BE0CD19137E2179
+);
+
+/**
+* First 8 bytes of the parameter block defined in BLAKE2 section 2.8, set to:
+* digest byte length = 8, key byte length = 0, fanout = 1, maximal depth = 1
+*/
+const BLAKE2B_PARAM = vec2<u32>(0x01010008u, 0u); // bytes (LE): 08 00 01 01
+
+/**
+* Message input length in bytes, which is always 40 (0x28) for Nano:
+* 8 nonce bytes + 32 block hash bytes
+*/
+const BLAKE2B_INLEN = vec2<u32>(0x00000028u, 0u); // x = low half, y = high half
+
+/**
+* Finalization flag as defined in BLAKE2 section 2.4 and set to all ones (~0)
+* since this is the final (and only) message block being hashed.
+*/
+const BLAKE2B_FINAL = vec2<u32>(0xFFFFFFFFu, 0xFFFFFFFFu); // 0xFFFFFFFFFFFFFFFF
+
+/**
+* Fully initialized 16-word working vector that each thread copies at start.
+* Application of each XOR is defined by BLAKE2 section 2.4 compression function.
+*/
+const BLAKE2B_INIT = array<vec2<u32>, 16>(
+	BLAKE2B_IV[0u] ^ BLAKE2B_PARAM, // v0 = h0: IV[0] with parameter block applied
+	BLAKE2B_IV[1u], // v1 = h1
+	BLAKE2B_IV[2u], // v2 = h2
+	BLAKE2B_IV[3u], // v3 = h3
+	BLAKE2B_IV[4u], // v4 = h4
+	BLAKE2B_IV[5u], // v5 = h5
+	BLAKE2B_IV[6u], // v6 = h6
+	BLAKE2B_IV[7u], // v7 = h7
+	BLAKE2B_IV[0u], // v8
+	BLAKE2B_IV[1u], // v9
+	BLAKE2B_IV[2u], // v10
+	BLAKE2B_IV[3u], // v11
+	BLAKE2B_IV[4u] ^ BLAKE2B_INLEN, // v12: XOR with low word of the byte counter
+	BLAKE2B_IV[5u], // v13: high counter word is zero, so IV[5] is unchanged
+	BLAKE2B_IV[6u] ^ BLAKE2B_FINAL, // v14: XOR with the finalization flag
+	BLAKE2B_IV[7u] // v15
+);
 
 /**
 * Used to fill partial `m` vec4 constructions.
@@ -97,32 +150,16 @@ fn main(id: vec3<u32>) {
 	let m4: vec2<u32> = ubo.blockhash[1u].zw;
 
 	/**
-	* Compression buffer initialized to 2 instances of initialization vector. Each
-	* vec2<u32> represents two halves of the original u64 value from the reference
-	* implementation. They appear reversed pairwise as defined below, but this is
-	* an illusion due to endianness: the `x` component of the vector is the low
-	* bits and the `y` component is the high bits, and if you laid the bits out
-	* individually, they would match the little-endian 64-bit representation.
-	*
-	* The following values have been modified from the BLAKE2B_IV:
-	*
-	* OUTLEN is constant 8 bytes
-	* v0.x ^= 0x01010000u ^ u32(OUTLEN);
-	*
-	* INLEN is constant 40 bytes: work value (8) + block hash (32)
-	* vC.x ^= u32(INLEN);
-	*
-	* It is always the "last" compression at this INLEN
-	* vE = ~vE;
+	* Working vector copied from BLAKE2B_INIT, two 64-bit words per vec4<u32>.
 	*/
-	var v01: vec4<u32> = vec4(BLAKE2B_IV_0, 0x84CAA73Bu, 0xBB67AE85u);
-	var v23: vec4<u32> = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
-	var v45: vec4<u32> = vec4(0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
-	var v67: vec4<u32> = vec4(0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u);
-	var v89: vec4<u32> = vec4(0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u);
-	var vAB: vec4<u32> = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
-	var vCD: vec4<u32> = vec4(0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
-	var vEF: vec4<u32> = vec4(0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u);
+	var v01: vec4<u32> = vec4<u32>(BLAKE2B_INIT[0u], BLAKE2B_INIT[1u]);
+	var v23: vec4<u32> = vec4<u32>(BLAKE2B_INIT[2u], BLAKE2B_INIT[3u]);
+	var v45: vec4<u32> = vec4<u32>(BLAKE2B_INIT[4u], BLAKE2B_INIT[5u]);
+	var v67: vec4<u32> = vec4<u32>(BLAKE2B_INIT[6u], BLAKE2B_INIT[7u]);
+	var v89: vec4<u32> = vec4<u32>(BLAKE2B_INIT[8u], BLAKE2B_INIT[9u]);
+	var vAB: vec4<u32> = vec4<u32>(BLAKE2B_INIT[10u], BLAKE2B_INIT[11u]);
+	var vCD: vec4<u32> = vec4<u32>(BLAKE2B_INIT[12u], BLAKE2B_INIT[13u]);
+	var vEF: vec4<u32> = vec4<u32>(BLAKE2B_INIT[14u], BLAKE2B_INIT[15u]);
 
 	/**
 	* Temporary variables used for subprocesses i=4 through i=7
@@ -1624,7 +1661,7 @@ fn main(id: vec3<u32>) {
 	* Set nonce if it passes the threshold and no other thread has set it.
 	* Only high bits are needed for comparison since threshold low bits are zero.
 	*/
-	if ((BLAKE2B_IV_0.y ^ v01.y ^ v89.y) >= ubo.threshold && atomicLoad(&work.found) == 0u) {
+	if ((BLAKE2B_INIT[0u].y ^ v01.y ^ v89.y) >= ubo.threshold && atomicLoad(&work.found) == 0u) { // digest high half: h0 ^ v0 ^ v8
 		atomicStore(&work.found, 1u);
 		work.nonce = m0;
 	}
-- 
2.34.1