From: Chris Duncan <chris@zoso.dev>
Date: Sun, 26 Jan 2025 08:20:52 +0000 (-0800)
Subject: Overhaul NanoPowGl to use vector operations which greatly simplifies the Blake2b... 
X-Git-Tag: v3.0.0~67
X-Git-Url: https://zoso.dev/?a=commitdiff_plain;h=2c4aca44635aaf0d82ea3079c0201e1f40db6550;p=nano-pow.git

Overhaul NanoPowGl to use vector operations which greatly simplifies the Blake2b algorithm. Simplify nonce generation and work output by taking advantage of WebGL2 32-bit vector RGBA pixel type options. Fix validation in NanoPowGl by restricting result to passed nonce. Clean up unused variables. Minor test page fixes.
---

diff --git a/src/classes/gl.ts b/src/classes/gl.ts
index fe3d56c..fc67d06 100644
--- a/src/classes/gl.ts
+++ b/src/classes/gl.ts
@@ -10,24 +10,18 @@ export class NanoPowGl {
 	/** Used to set canvas size. Must be a multiple of 256. */
 	static #WORKLOAD: number = 256 * Math.max(1, Math.floor(navigator.hardwareConcurrency))
 
-	static #hexify (arr: number[] | Uint8Array): string {
-		let out = ''
-		for (let i = arr.length - 1; i >= 0; i--) {
-			out += arr[i].toString(16).padStart(2, '0')
-		}
-		return out
-	}
-
 	static #gl: WebGL2RenderingContext | null
 	static #program: WebGLProgram | null
 	static #vertexShader: WebGLShader | null
 	static #fragmentShader: WebGLShader | null
+	static #texture: WebGLTexture | null
+	static #framebuffer: WebGLFramebuffer | null
 	static #positionBuffer: WebGLBuffer | null
 	static #uvBuffer: WebGLBuffer | null
 	static #uboBuffer: WebGLBuffer | null
 	static #workBuffer: WebGLBuffer | null
 	static #query: WebGLQuery | null
-	static #pixels: Uint8Array
+	static #pixels: Uint32Array
 	/**Vertex Positions, 2 triangles */
 	static #positions = new Float32Array([
 		-1, -1, 0, -1, 1, 0, 1, 1, 0,
@@ -76,17 +70,33 @@ export class NanoPowGl {
 			const triangleArray = this.#gl.createVertexArray()
 			this.#gl.bindVertexArray(triangleArray)
 
+			this.#texture = this.#gl.createTexture()
+			this.#gl.bindTexture(this.#gl.TEXTURE_2D, this.#texture)
+			this.#gl.texImage2D(this.#gl.TEXTURE_2D, 0, this.#gl.RGBA32UI, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, 0, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, null)
+			this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MIN_FILTER, this.#gl.NEAREST)
+			this.#gl.texParameteri(this.#gl.TEXTURE_2D, this.#gl.TEXTURE_MAG_FILTER, this.#gl.NEAREST)
+			this.#gl.bindTexture(this.#gl.TEXTURE_2D, null)
+
+			this.#framebuffer = this.#gl.createFramebuffer()
+			this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer)
+			this.#gl.framebufferTexture2D(this.#gl.FRAMEBUFFER, this.#gl.COLOR_ATTACHMENT0, this.#gl.TEXTURE_2D, this.#texture, 0)
+			if (this.#gl.checkFramebufferStatus(this.#gl.FRAMEBUFFER) !== this.#gl.FRAMEBUFFER_COMPLETE)
+				throw new Error(`Failed to create framebuffer`)
+			this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
+
 			this.#positionBuffer = this.#gl.createBuffer()
 			this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#positionBuffer)
 			this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#positions, this.#gl.STATIC_DRAW)
 			this.#gl.vertexAttribPointer(0, 3, this.#gl.FLOAT, false, 0, 0)
 			this.#gl.enableVertexAttribArray(0)
+			this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null)
 
 			this.#uvBuffer = this.#gl.createBuffer()
 			this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, this.#uvBuffer)
 			this.#gl.bufferData(this.#gl.ARRAY_BUFFER, this.#uvPosArray, this.#gl.STATIC_DRAW)
 			this.#gl.vertexAttribPointer(1, 2, this.#gl.FLOAT, false, 0, 0)
 			this.#gl.enableVertexAttribArray(1)
+			this.#gl.bindBuffer(this.#gl.ARRAY_BUFFER, null)
 
 			this.#uboBuffer = this.#gl.createBuffer()
 			this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#uboBuffer)
@@ -94,6 +104,7 @@ export class NanoPowGl {
 			this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
 			this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 0, this.#uboBuffer)
 			this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, 'UBO'), 0)
+			this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
 
 			this.#workBuffer = this.#gl.createBuffer()
 			this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer)
@@ -101,8 +112,9 @@ export class NanoPowGl {
 			this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
 			this.#gl.bindBufferBase(this.#gl.UNIFORM_BUFFER, 1, this.#workBuffer)
 			this.#gl.uniformBlockBinding(this.#program, this.#gl.getUniformBlockIndex(this.#program, 'WORK'), 1)
+			this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
 
-			this.#pixels = new Uint8Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4)
+			this.#pixels = new Uint32Array(this.#gl.drawingBufferWidth * this.#gl.drawingBufferHeight * 4)
 			this.#query = this.#gl.createQuery()
 		} catch (err) {
 			throw new Error(`WebGL initialization failed. ${err}`)
@@ -117,6 +129,8 @@ export class NanoPowGl {
 		NanoPowGl.#uboBuffer = null
 		NanoPowGl.#uvBuffer = null
 		NanoPowGl.#positionBuffer = null
+		NanoPowGl.#framebuffer = null
+		NanoPowGl.#texture = null
 		NanoPowGl.#fragmentShader = null
 		NanoPowGl.#vertexShader = null
 		NanoPowGl.#program = null
@@ -157,11 +171,13 @@ export class NanoPowGl {
 
 		/** Upload work buffer */
 		this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, this.#workBuffer)
-		this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, Uint32Array.from(work))
+		this.#gl.bufferSubData(this.#gl.UNIFORM_BUFFER, 0, work)
 		this.#gl.bindBuffer(this.#gl.UNIFORM_BUFFER, null)
 
 		this.#gl.beginQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE, this.#query)
+		this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer)
 		this.#gl.drawArrays(this.#gl.TRIANGLES, 0, 6)
+		this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
 		this.#gl.endQuery(this.#gl.ANY_SAMPLES_PASSED_CONSERVATIVE)
 	}
 
@@ -189,22 +205,18 @@ export class NanoPowGl {
 	* byte, converts the subsequent 3 pixels with the nonce byte values to a hex
 	* string, and returns the result.
 	*
-	* @param workBytes - Buffer with the original random nonce value
 	* @param workHex - Original nonce if provided for a validation call
 	* @returns Nonce as an 8-byte (16-char) hexadecimal string
 	*/
-	static #readResult (workBytes: Uint8Array, workHex?: string): string {
+	static #readResult (workHex?: string): string {
 		if (this.#gl == null) throw new Error('WebGL 2 is required to read pixels')
-		this.#gl.readPixels(0, 0, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, this.#gl.RGBA, this.#gl.UNSIGNED_BYTE, this.#pixels)
+		this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, this.#framebuffer)
+		this.#gl.readPixels(0, 0, this.#gl.drawingBufferWidth, this.#gl.drawingBufferHeight, this.#gl.RGBA_INTEGER, this.#gl.UNSIGNED_INT, this.#pixels)
+		this.#gl.bindFramebuffer(this.#gl.FRAMEBUFFER, null)
 		for (let i = 0; i < this.#pixels.length; i += 4) {
 			if (this.#pixels[i] !== 0) {
 				/** Return the work value with the custom bits */
-				const hex = this.#hexify(workBytes.subarray(4, 8)) + this.#hexify([
-					this.#pixels[i + 2],
-					this.#pixels[i + 3],
-					workBytes[2] ^ (this.#pixels[i] - 1),
-					workBytes[3] ^ (this.#pixels[i + 1] - 1)
-				])
+				const hex = `${this.#pixels[i+1].toString(16).padStart(8, '0')}${this.#pixels[i+2].toString(16).padStart(8, '0')}`
 				if (workHex == null || workHex == hex) return hex
 			}
 		}
@@ -233,7 +245,7 @@ export class NanoPowGl {
 		const threshold = (typeof options?.threshold !== 'number' || options.threshold < 0x0 || options.threshold > 0xffffffff)
 			? 0xfffffff8
 			: options.threshold
-		const effort = (typeof options?.effort !== 'number' || options.effort < 0x1 || options.effort > 0x10)
+		const effort = (typeof options?.effort !== 'number' || options.effort < 0x1 || options.effort > 0x20)
 			? 0x8
 			: options.effort
 		const debug = !!(options?.debug)
@@ -268,7 +280,7 @@ export class NanoPowGl {
 			const found = await this.#checkQueryResult()
 			times.push(performance.now() - start)
 			if (found) {
-				nonce = this.#readResult(seed)
+				nonce = this.#readResult()
 			}
 		}
 		this.#busy = false
@@ -302,6 +314,12 @@ export class NanoPowGl {
 			: options.threshold
 		const debug = !!(options?.debug)
 
+		/** Validation only checks the supplied nonce, so force the workload to 1 and reset if it differs */
+		if (this.#WORKLOAD !== 1) {
+			this.#WORKLOAD = 1
+			this.reset()
+		}
+
 		/** Set up uniform buffer object */
 		const uboView = new DataView(new ArrayBuffer(144))
 		for (let i = 0; i < 64; i += 8) {
@@ -323,7 +341,7 @@ export class NanoPowGl {
 		let found = await this.#checkQueryResult()
 		if (found) {
 			try {
-				nonce = this.#readResult(seed, work)
+				nonce = this.#readResult(work)
 			} catch (err) {
 				found = false
 			}
diff --git a/src/classes/index.ts b/src/classes/index.ts
index d880852..7908c6f 100644
--- a/src/classes/index.ts
+++ b/src/classes/index.ts
@@ -16,6 +16,7 @@ try {
 	await NanoPowGl.init()
 	isGlSupported = true
 } catch (err) {
+	console.error(err)
 	console.warn(`WebGL is not supported in this environment.`)
 	isGlSupported = false
 }
diff --git a/src/shaders/gl-fragment.ts b/src/shaders/gl-fragment.ts
index 78c22ef..1bed519 100644
--- a/src/shaders/gl-fragment.ts
+++ b/src/shaders/gl-fragment.ts
@@ -5,10 +5,9 @@
 export const NanoPowGlFragmentShader = `#version 300 es
 #pragma vscode_glsllint_stage: frag
 precision highp float;
-precision highp int;
 
 in vec2 uv_pos;
-out vec4 fragColor;
+out uvec4 nonce;
 
 // blockhash - array of precalculated block hash components
 // threshold - 0xfffffff8 for send/change blocks, 0xfffffe00 for all else
@@ -20,17 +19,21 @@ layout(std140) uniform UBO {
 };
 
 // Random work values
-// First 2 bytes will be overwritten by texture pixel position
-// Second 2 bytes will be modified if the canvas size is greater than 256x256
-// Last 4 bytes remain as generated externally
 layout(std140) uniform WORK {
-	uvec4 work[2];
+	uvec2 work;
 };
 
 // Defined separately from uint v[32] below as the original value is required
 // to calculate the second uint32 of the digest for threshold comparison
 const uint BLAKE2B_IV32_1 = 0x6A09E667u;
 
+// Used during G for vector bit rotations
+const uvec2 ROTATE_1 = uvec2(1u, 1u);
+const uvec2 ROTATE_8 = uvec2(8u, 8u);
+const uvec2 ROTATE_16 = uvec2(16u, 16u);
+const uvec2 ROTATE_24 = uvec2(24u, 24u);
+const uvec2 ROTATE_31 = uvec2(31u, 31u);
+
 // Both buffers represent 16 uint64s as 32 uint32s
 // because that's what GLSL offers, just like Javascript
 
@@ -43,143 +46,85 @@ const uint BLAKE2B_IV32_1 = 0x6A09E667u;
 // It's always the "last" compression at this INLEN
 // v[28] = ~v[28];
 // v[29] = ~v[29];
-uint v[32] = uint[32](
-	0xF2BDC900u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u,
-	0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au,
-	0xADE682D1u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu,
-	0xFB41BD6Bu, 0x1F83D9ABu, 0x137E2179u, 0x5BE0CD19u,
-	0xF3BCC908u, 0x6A09E667u, 0x84CAA73Bu, 0xBB67AE85u,
-	0xFE94F82Bu, 0x3C6EF372u, 0x5F1D36F1u, 0xA54FF53Au,
-	0xADE682F9u, 0x510E527Fu, 0x2B3E6C1Fu, 0x9B05688Cu,
-	0x04BE4294u, 0xE07C2654u, 0x137E2179u, 0x5BE0CD19u
+uvec2 v[16] = uvec2[16](
+	uvec2(0xF2BDC900u, 0x6A09E667u),
+	uvec2(0x84CAA73Bu, 0xBB67AE85u),
+	uvec2(0xFE94F82Bu, 0x3C6EF372u),
+	uvec2(0x5F1D36F1u, 0xA54FF53Au),
+	uvec2(0xADE682D1u, 0x510E527Fu),
+	uvec2(0x2B3E6C1Fu, 0x9B05688Cu),
+	uvec2(0xFB41BD6Bu, 0x1F83D9ABu),
+	uvec2(0x137E2179u, 0x5BE0CD19u),
+	uvec2(0xF3BCC908u, 0x6A09E667u),
+	uvec2(0x84CAA73Bu, 0xBB67AE85u),
+	uvec2(0xFE94F82Bu, 0x3C6EF372u),
+	uvec2(0x5F1D36F1u, 0xA54FF53Au),
+	uvec2(0xADE682F9u, 0x510E527Fu),
+	uvec2(0x2B3E6C1Fu, 0x9B05688Cu),
+	uvec2(0x04BE4294u, 0xE07C2654u),
+	uvec2(0x137E2179u, 0x5BE0CD19u)
 );
-// Input data buffer
-uint m[32];
 
-// These are offsets into the input data buffer for each mixing step.
-// They are multiplied by 2 from the original SIGMA values in
-// the C reference implementation, which refered to uint64s.
-const uint SIGMA82[192] = uint[192](
-	0u,2u,4u,6u,8u,10u,12u,14u,16u,18u,20u,22u,24u,26u,28u,30u,
-	28u,20u,8u,16u,18u,30u,26u,12u,2u,24u,0u,4u,22u,14u,10u,6u,
-	22u,16u,24u,0u,10u,4u,30u,26u,20u,28u,6u,12u,14u,2u,18u,8u,
-	14u,18u,6u,2u,26u,24u,22u,28u,4u,12u,10u,20u,8u,0u,30u,16u,
-	18u,0u,10u,14u,4u,8u,20u,30u,28u,2u,22u,24u,12u,16u,6u,26u,
-	4u,24u,12u,20u,0u,22u,16u,6u,8u,26u,14u,10u,30u,28u,2u,18u,
-	24u,10u,2u,30u,28u,26u,8u,20u,0u,14u,12u,6u,18u,4u,16u,22u,
-	26u,22u,14u,28u,24u,2u,6u,18u,10u,0u,30u,8u,16u,12u,4u,20u,
-	12u,30u,28u,18u,22u,6u,0u,16u,24u,4u,26u,14u,2u,8u,20u,10u,
-	20u,4u,16u,8u,14u,12u,2u,10u,30u,22u,18u,28u,6u,24u,26u,0u,
-	0u,2u,4u,6u,8u,10u,12u,14u,16u,18u,20u,22u,24u,26u,28u,30u,
-	28u,20u,8u,16u,18u,30u,26u,12u,2u,24u,0u,4u,22u,14u,10u,6u
+// Input data buffer
+uvec2 m[16];
+
+// Offsets into the input data buffer for each mixing step
+const uint SIGMA[192] = uint[192](
+	0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u,
+	14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u,
+	11u,8u,12u,0u,5u,2u,15u,13u,10u,14u,3u,6u,7u,1u,9u,4u,
+	7u,9u,3u,1u,13u,12u,11u,14u,2u,6u,5u,10u,4u,0u,15u,8u,
+	9u,0u,5u,7u,2u,4u,10u,15u,14u,1u,11u,12u,6u,8u,3u,13u,
+	2u,12u,6u,10u,0u,11u,8u,3u,4u,13u,7u,5u,15u,14u,1u,9u,
+	12u,5u,1u,15u,14u,13u,4u,10u,0u,7u,6u,3u,9u,2u,8u,11u,
+	13u,11u,7u,14u,12u,1u,3u,9u,5u,0u,15u,4u,8u,6u,2u,10u,
+	6u,15u,14u,9u,11u,3u,0u,8u,12u,2u,13u,7u,1u,4u,10u,5u,
+	10u,2u,8u,4u,7u,6u,1u,5u,15u,11u,9u,14u,3u,12u,13u,0u,
+	0u,1u,2u,3u,4u,5u,6u,7u,8u,9u,10u,11u,12u,13u,14u,15u,
+	14u,10u,4u,8u,9u,15u,13u,6u,1u,12u,0u,2u,11u,7u,5u,3u
 );
 
 // G mixing function
-void G (uint ix, uint iy, uint a, uint b, uint c, uint d) {
-	uint carry;
-	uint xor0;
-	uint xor1;
-
-	// a = a + b;
-	v[a] = v[a] + v[b];
-	carry = uint(v[a] < v[b]);
-	v[a+1u] = v[a+1u] + v[b+1u] + carry;
-
-	// a = a + m[sigma[r][2*i+0]];
-	v[a] = v[a] + m[ix];
-	carry = uint(v[a] < m[ix]);
-	v[a+1u] = v[a+1u] + m[ix+1u] + carry;
-
-	// d = rotr64(d ^ a, 32);
-	xor0 = v[d] ^ v[a];
-	xor1 = v[d+1u] ^ v[a+1u];
-	v[d] = xor1;
-	v[d+1u] = xor0;
-
-	// c = c + d;
-	v[c] = v[c] + v[d];
-	carry = uint(v[c] < v[d]);
-	v[c+1u] = v[c+1u] + v[d+1u] + carry;
-
-	// b = rotr64(b ^ c, 24);
-	xor0 = v[b] ^ v[c];
-	xor1 = v[b+1u] ^ v[c+1u];
-	v[b] = (xor0 >> 24u) ^ (xor1 << 8u);
-	v[b+1u] = (xor1 >> 24u) ^ (xor0 << 8u);
-
-	// a = a + b;
-	v[a] = v[a] + v[b];
-	carry = uint(v[a] < v[b]);
-	v[a+1u] = v[a+1u] + v[b+1u] + carry;
-
-	// a = a + m[sigma[r][2*i+0]];
-	v[a] = v[a] + m[iy];
-	carry = uint(v[a] < m[iy]);
-	v[a+1u] = v[a+1u] + m[iy+1u] + carry;
-
-	// d = rotr64(d ^ a, 16)
-	xor0 = v[d] ^ v[a];
-	xor1 = v[d+1u] ^ v[a+1u];
-	v[d] = (xor0 >> 16u) ^ (xor1 << 16u);
-	v[d+1u] = (xor1 >> 16u) ^ (xor0 << 16u);
-
-	// c = c + d;
-	v[c] = v[c] + v[d];
-	carry = uint(v[c] < v[d]);
-	v[c+1u] = v[c+1u] + v[d+1u] + carry;
-
-	// b = rotr64(b ^ c, 63)
-	xor0 = v[b] ^ v[c];
-	xor1 = v[b+1u] ^ v[c+1u];
-	v[b] = (xor1 >> 31u) ^ (xor0 << 1u);
-	v[b+1u] = (xor0 >> 31u) ^ (xor1 << 1u);
+void G (uint a, uint b, uint c, uint d, uint x, uint y) {
+	v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x));
+	v[a] = v[a] + m[x] + uvec2(0u, uint(v[a].x + m[x].x < m[x].x));
+	v[d] = (v[d] ^ v[a]).yx;
+	v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x));
+	v[b] = ((v[b] ^ v[c]) >> ROTATE_24) | ((v[b] ^ v[c]).yx << ROTATE_8);
+	v[a] = v[a] + v[b] + uvec2(0u, uint(v[a].x + v[b].x < v[b].x));
+	v[a] = v[a] + m[y] + uvec2(0u, uint(v[a].x + m[y].x < m[y].x));
+	v[d] = ((v[d] ^ v[a]) >> ROTATE_16) | ((v[d] ^ v[a]).yx << ROTATE_16);
+	v[c] = v[c] + v[d] + uvec2(0u, uint(v[c].x + v[d].x < v[d].x));
+	v[b] = ((v[b] ^ v[c]).yx >> ROTATE_31) | ((v[b] ^ v[c]) << ROTATE_1);
 }
 
 void main() {
-	int i;
-	uvec4 u_work0 = work[0u];
-	uvec4 u_work1 = work[1u];
-	uint uv_x = uint(uv_pos.x * workload);
-	uint uv_y = uint(uv_pos.y * workload);
-	uint x_pos = uv_x % 256u;
-	uint y_pos = uv_y % 256u;
-	uint x_index = (uv_x - x_pos) / 256u;
-	uint y_index = (uv_y - y_pos) / 256u;
-
-	// First 2 work bytes are the x,y pos within the 256x256 area, the next
-	// two bytes are modified from the random generated value, XOR'd with
-	// the x,y area index of where this pixel is located
-	m[0u] = (x_pos ^ (y_pos << 8u) ^ ((u_work0.b ^ x_index) << 16u) ^ ((u_work0.a ^ y_index) << 24u));
-
-	// Remaining bytes are un-modified from the random generated value
-	m[1u] = (u_work1.r ^ (u_work1.g << 8u) ^ (u_work1.b << 16u) ^ (u_work1.a << 24u));
+	// Nonce uniquely differentiated by pixel location
+	m[0u].x = work.x ^ uint(uv_pos.x * workload);
+	m[0u].y = work.y ^ uint(uv_pos.y * workload);
 
 	// Block hash
-	for (uint i = 0u; i < 8u; i = i + 1u) {
-		m[i+2u] = blockhash[i];
-	}
+	m[1u] = uvec2(blockhash[0u], blockhash[1u]);
+	m[2u] = uvec2(blockhash[2u], blockhash[3u]);
+	m[3u] = uvec2(blockhash[4u], blockhash[5u]);
+	m[4u] = uvec2(blockhash[6u], blockhash[7u]);
 
 	// twelve rounds of mixing
 	for(uint i = 0u; i < 12u; i = i + 1u) {
-		G(SIGMA82[i * 16u + 0u], SIGMA82[i * 16u + 1u], 0u, 8u, 16u, 24u);
-		G(SIGMA82[i * 16u + 2u], SIGMA82[i * 16u + 3u], 2u, 10u, 18u, 26u);
-		G(SIGMA82[i * 16u + 4u], SIGMA82[i * 16u + 5u], 4u, 12u, 20u, 28u);
-		G(SIGMA82[i * 16u + 6u], SIGMA82[i * 16u + 7u], 6u, 14u, 22u, 30u);
-		G(SIGMA82[i * 16u + 8u], SIGMA82[i * 16u + 9u], 0u, 10u, 20u, 30u);
-		G(SIGMA82[i * 16u + 10u], SIGMA82[i * 16u + 11u], 2u, 12u, 22u, 24u);
-		G(SIGMA82[i * 16u + 12u], SIGMA82[i * 16u + 13u], 4u, 14u, 16u, 26u);
-		G(SIGMA82[i * 16u + 14u], SIGMA82[i * 16u + 15u], 6u, 8u, 18u, 28u);
+		G(0u, 4u, 8u, 12u, SIGMA[i * 16u + 0u], SIGMA[i * 16u + 1u]);
+		G(1u, 5u, 9u, 13u, SIGMA[i * 16u + 2u], SIGMA[i * 16u + 3u]);
+		G(2u, 6u, 10u, 14u, SIGMA[i * 16u + 4u], SIGMA[i * 16u + 5u]);
+		G(3u, 7u, 11u, 15u, SIGMA[i * 16u + 6u], SIGMA[i * 16u + 7u]);
+		G(0u, 5u, 10u, 15u, SIGMA[i * 16u + 8u], SIGMA[i * 16u + 9u]);
+		G(1u, 6u, 11u, 12u, SIGMA[i * 16u + 10u], SIGMA[i * 16u + 11u]);
+		G(2u, 7u, 8u, 13u, SIGMA[i * 16u + 12u], SIGMA[i * 16u + 13u]);
+		G(3u, 4u, 9u, 14u, SIGMA[i * 16u + 14u], SIGMA[i * 16u + 15u]);
 	}
 
-	// Pixel data is multipled by threshold test result (0 or 1)
-	// First 4 bytes insignificant, only calculate digest of second 4 bytes
-	if ((BLAKE2B_IV32_1 ^ v[1u] ^ v[17u]) > threshold) {
-		fragColor = vec4(
-			float(x_index + 1u)/255.0, // +1 to distinguish from 0 (unsuccessful) pixels
-			float(y_index + 1u)/255.0, // Same as previous
-			float(x_pos)/255.0, // Return the 2 custom bytes used in work value
-			float(y_pos)/255.0  // Second custom byte
-		);
+	// Pixel data set from work values
+	// Finalize digest from high bits, low bits can be safely ignored
+	if ((BLAKE2B_IV32_1 ^ v[0u].y ^ v[8u].y) > threshold) {
+		nonce = uvec4(1u, m[0].y, m[0].x, 1u);
 	} else {
  		discard;
 	}
diff --git a/test.html b/test.html
index e016fd2..0b5a637 100644
--- a/test.html
+++ b/test.html
@@ -59,9 +59,11 @@ SPDX-License-Identifier: GPL-3.0-or-later
 		}
 
 		export async function run (threshold, size, effort, isOutputShown, isGlForced, isDebug) {
-			if (isGlForced) NanoPow = NanoPowGl
+			NanoPow = isGlForced ? NanoPowGl : NanoPowGpu
+			const type = (NanoPow === NanoPowGpu) ? 'WebGPU' : (NanoPow === NanoPowGl) ? 'WebGL' : 'unknown API'
 			document.getElementById('status').innerHTML = `TESTING IN PROGRESS 0/${size}`
 			console.log(`%cNanoPow`, 'color:green', 'Checking validate()')
+
 			const expectFalse = await NanoPow.validate('0000000000000000', '0000000000000000000000000000000000000000000000000000000000000000')
 			console.log(`validate() output for bad nonce is ${expectFalse === false ? 'correct' : 'incorrect'}`)
 			const expectTrue = await NanoPow.validate('47c83266398728cf', '92BA74A7D6DC7557F3EDA95ADC6341D51AC777A0A6FF0688A5C492AB2B2CB40D')
@@ -70,7 +72,6 @@ SPDX-License-Identifier: GPL-3.0-or-later
 			console.log(`validate() output for slightly wrong nonce is ${expectFalseForVariation === false ? 'correct' : 'incorrect'}`)
 			const expectTrueForCollision = await NanoPow.validate('c5d5d6f7c5d6ccd1', '281e89ac73b1082b464b9c3c1168384f846d39f6df25105f8b4a22915e999117')
 			console.log(`validate() output for colliding nonce is ${expectTrueForCollision === true ? 'correct' : 'incorrect'}`)
-			const type = (NanoPow === NanoPowGpu) ? 'WebGPU' : (NanoPow === NanoPowGl) ? 'WebGL' : 'unknown API'
 
 			try {
 				if (expectFalse || !expectTrue) throw new Error(`Validation is not working`)