@group(0) @binding(1) var<storage, read_write> work: WORK;
/**
-*
-* Numeric literal used in the finalization digest is the original value of the
-* first element of the initialization vector:`blake2b_IV[0]`.
+* Initialization vector defined by BLAKE2b, stored as eight 64-bit words. Each
+* vec2<u32> holds the two halves of one u64 value from the reference
+* implementation: the `x` component is the low 32 bits and the `y` component
+* is the high 32 bits. For example, the reference value 0x6A09E667F3BCC908 is
+* written here as vec2<u32>(0xF3BCC908u, 0x6A09E667u). The halves look swapped
+* when read left to right, but that is only an artifact of endianness: laid
+* out bit by bit, they match the little-endian 64-bit representation.
*/
-const BLAKE2B_IV_0 = vec2(0xF2BDC900u, 0x6A09E667u);
+const BLAKE2B_IV = array<vec2<u32>, 8>(
+ vec2<u32>(0xF3BCC908u, 0x6A09E667u),
+ vec2<u32>(0x84CAA73Bu, 0xBB67AE85u),
+ vec2<u32>(0xFE94F82Bu, 0x3C6EF372u),
+ vec2<u32>(0x5F1D36F1u, 0xA54FF53Au),
+ vec2<u32>(0xADE682D1u, 0x510E527Fu),
+ vec2<u32>(0x2B3E6C1Fu, 0x9B05688Cu),
+ vec2<u32>(0xFB41BD6Bu, 0x1F83D9ABu),
+ vec2<u32>(0x137E2179u, 0x5BE0CD19u)
+);
+
+/**
+* First 64-bit word of the parameter block as defined in BLAKE2 section 2.8,
+* stored as (low 32 bits, high 32 bits). Reading the low half 0x01010008 from
+* its least significant byte upward:
+*   0x08 digest byte length = 8
+*   0x00 key byte length    = 0 (unkeyed)
+*   0x01 fanout             = 1
+*   0x01 maximal depth      = 1
+* The high half is zero. Only this word is XORed into the state; see
+* BLAKE2B_INIT, where it is applied to the first entry alone.
+*/
+const BLAKE2B_PARAM = vec2<u32>(0x01010008u, 0u);
+
+/**
+* Message input length in bytes, which is always 40 (0x28) for Nano:
+* 8 nonce bytes + 32 block hash bytes.
+* Stored as (low 32 bits, high 32 bits) of a 64-bit value; the high half is
+* zero. It is XORed into entry 12 of BLAKE2B_INIT.
+*/
+const BLAKE2B_INLEN = vec2<u32>(0x00000028u, 0u);
+
+/**
+* Finalization flag as defined in BLAKE2 section 2.4 and set to ~0 (all 64 bits
+* set, in both the low and high halves) since this is the final (and only)
+* message block being hashed. XORing a word with this value inverts every bit
+* of that word; it is applied to entry 14 of BLAKE2B_INIT.
+*/
+const BLAKE2B_FINAL = vec2<u32>(0xFFFFFFFFu, 0xFFFFFFFFu);
+
+/**
+* Fully initialized state array that is locally copied at each thread start.
+* It holds 16 entries, each one 64-bit word split as (low, high):
+*   entries 0-7  are BLAKE2B_IV, with BLAKE2B_PARAM XORed into entry 0
+*   entries 8-15 are a second copy of BLAKE2B_IV, with BLAKE2B_INLEN XORed
+*                into entry 12 and BLAKE2B_FINAL XORed into entry 14
+* Application of each XOR is defined by BLAKE2 section 2.4 compression function.
+* Because every operand is a constant, the whole array is a constant as well.
+*/
+const BLAKE2B_INIT = array<vec2<u32>, 16>(
+ BLAKE2B_IV[0u] ^ BLAKE2B_PARAM,
+ BLAKE2B_IV[1u],
+ BLAKE2B_IV[2u],
+ BLAKE2B_IV[3u],
+ BLAKE2B_IV[4u],
+ BLAKE2B_IV[5u],
+ BLAKE2B_IV[6u],
+ BLAKE2B_IV[7u],
+ BLAKE2B_IV[0u],
+ BLAKE2B_IV[1u],
+ BLAKE2B_IV[2u],
+ BLAKE2B_IV[3u],
+ BLAKE2B_IV[4u] ^ BLAKE2B_INLEN,
+ BLAKE2B_IV[5u],
+ BLAKE2B_IV[6u] ^ BLAKE2B_FINAL,
+ BLAKE2B_IV[7u]
+);
/**
* Used to fill partial `m` vec4 constructions.
let m4: vec2<u32> = ubo.blockhash[1u].zw;
/**
- * Compression buffer initialized to 2 instances of initialization vector. Each
- * vec2<u32> represents two halves of the original u64 value from the reference
- * implementation. They appear reversed pairwise as defined below, but this is
- * an illusion due to endianness: the `x` component of the vector is the low
- * bits and the `y` component is the high bits, and if you laid the bits out
- * individually, they would match the little-endian 64-bit representation.
- *
- * The following values have been modified from the BLAKE2B_IV:
- *
- * OUTLEN is constant 8 bytes
- * v0.x ^= 0x01010000u ^ u32(OUTLEN);
- *
- * INLEN is constant 40 bytes: work value (8) + block hash (32)
- * vC.x ^= u32(INLEN);
- *
- * It is always the "last" compression at this INLEN
- * vE = ~vE;
+ * Compression buffer copied from the precomputed BLAKE2B_INIT constant. Each
+ * vec4<u32> packs two consecutive 64-bit entries as (low, high, low, high).
*/
- var v01: vec4<u32> = vec4(BLAKE2B_IV_0, 0x84CAA73Bu, 0xBB67AE85u);
- var v23: vec4<u32> = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
- var v45: vec4<u32> = vec4(0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
- var v67: vec4<u32> = vec4(0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u);
- var v89: vec4<u32> = vec4(0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u);
- var vAB: vec4<u32> = vec4(0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au);
- var vCD: vec4<u32> = vec4(0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu);
- var vEF: vec4<u32> = vec4(0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u);
+ var v01: vec4<u32> = vec4<u32>(BLAKE2B_INIT[0u], BLAKE2B_INIT[1u]);
+ var v23: vec4<u32> = vec4<u32>(BLAKE2B_INIT[2u], BLAKE2B_INIT[3u]);
+ var v45: vec4<u32> = vec4<u32>(BLAKE2B_INIT[4u], BLAKE2B_INIT[5u]);
+ var v67: vec4<u32> = vec4<u32>(BLAKE2B_INIT[6u], BLAKE2B_INIT[7u]);
+ var v89: vec4<u32> = vec4<u32>(BLAKE2B_INIT[8u], BLAKE2B_INIT[9u]);
+ var vAB: vec4<u32> = vec4<u32>(BLAKE2B_INIT[10u], BLAKE2B_INIT[11u]);
+ var vCD: vec4<u32> = vec4<u32>(BLAKE2B_INIT[12u], BLAKE2B_INIT[13u]);
+ var vEF: vec4<u32> = vec4<u32>(BLAKE2B_INIT[14u], BLAKE2B_INIT[15u]);
/**
* Temporary variables used for subprocesses i=4 through i=7
* Set nonce if it passes the threshold and no other thread has set it.
* Only high bits are needed for comparison since threshold low bits are zero.
*/
- if ((BLAKE2B_IV_0.y ^ v01.y ^ v89.y) >= ubo.threshold && atomicLoad(&work.found) == 0u) {
+ if ((BLAKE2B_IV[0u].y ^ v01.y ^ v89.y) >= ubo.threshold && atomicLoad(&work.found) == 0u) {
atomicStore(&work.found, 1u);
work.nonce = m0;
}